diff --git a/.cursor/rules/nemo-platform.mdc b/.cursor/rules/nemo-platform.mdc index 740b96a2..138011c5 100644 --- a/.cursor/rules/nemo-platform.mdc +++ b/.cursor/rules/nemo-platform.mdc @@ -37,9 +37,8 @@ User-facing skills in `packages/nemo_platform_ext/src/nemo_platform_ext/skills/` - `nemo-try-agent`: test a deployed agent or chat with a model. - `nemo-status`: read-only health dashboard. Run this before assuming the platform is up. - `nemo-teardown`: guided shutdown with confirmation. -- `nemo-fine-tune`: fine-tuning. Not yet available; the skill tells the user this honestly instead of letting you improvise. -Plugin-owned skills under `plugins/*/src/*/skills/` handle their own routing for guardrails, evaluations, optimization, data designer, anonymizer, and auditor. +Plugin-owned skills under `plugins/*/src/*/skills/` handle their own routing for customization, guardrails, evaluations, optimization, data designer, anonymizer, and auditor. ## Sandboxed environments diff --git a/AGENTS.md b/AGENTS.md index 60b08a90..7ca90027 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -33,9 +33,8 @@ User-facing skills in `packages/nemo_platform_ext/src/nemo_platform_ext/skills/` - `nemo-try-agent`: test a deployed agent or chat with a model. - `nemo-status`: read-only health dashboard. - `nemo-teardown`: guided shutdown with confirmation. -- `nemo-fine-tune`: fine-tuning. Not yet available; the skill tells the user it's not shipped instead of improvising with another training library. -Plugin-owned skills under `plugins/*/src/*/skills/` handle guardrails, evaluations, optimization, data designer, anonymizer, and auditor. +Plugin-owned skills under `plugins/*/src/*/skills/` handle their own routing for customization, guardrails, evaluations, optimization, data designer, anonymizer, and auditor. 
### Working in a sandboxed environment diff --git a/CLAUDE.md b/CLAUDE.md index 9abcef4f..8554f5c6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -33,9 +33,8 @@ User-facing skills in `packages/nemo_platform_ext/src/nemo_platform_ext/skills/` - `nemo-try-agent`: test a deployed agent or chat with a model. - `nemo-status`: read-only health dashboard. Run this before assuming the platform is up. - `nemo-teardown`: guided shutdown with confirmation. -- `nemo-fine-tune`: fine-tuning. Not yet available; the skill tells the user it's not shipped instead of letting the agent improvise with another training library. -Plugin-owned skills live under `plugins/*/src/*/skills/` and handle their own routing for guardrails, evaluations, optimization, data designer, anonymizer, and auditor. +Plugin-owned skills live under `plugins/*/src/*/skills/` and handle their own routing for customization, guardrails, evaluations, optimization, data designer, anonymizer, and auditor. ### Working in a sandboxed coding-agent environment diff --git a/docker-bake.automodel.hcl b/docker-bake.automodel.hcl new file mode 100644 index 00000000..d51f2157 --- /dev/null +++ b/docker-bake.automodel.hcl @@ -0,0 +1,203 @@ +# nmp-automodel image bake - run from Platform repo root (context = "."). +# +# Inspect targets (no build; finishes in ~0s): +# docker buildx bake --print -f docker-bake.automodel.hcl nmp-automodel-gpu-wheels +# +# Build wheels (override registry/tag via env, not --set): +# export WHEELS_REGISTRY=nvcr.io/0921617854601259/nemo-platform-dev +# export WHEELS_TAG=$(git rev-parse --short HEAD) +# docker buildx bake -f docker-bake.automodel.hcl nmp-automodel-gpu-wheels --push +# +# Build automodel images: +# docker buildx bake -f docker-bake.automodel.hcl nmp-automodel-base-builder +# +# Published tags: nvcr.io/0921617854601259/nemo-platform-dev/nmp-automodel-{base,tasks,training}:${BAKE_TAG} +# NVCR allows only one repo segment after the registry prefix (no nmp/automodel-base nesting). 
+ +variable "IMAGE_REGISTRY" { + default = "nvcr.io/0921617854601259/nemo-platform-dev" +} + +variable "BASE_REGISTRY" { + default = "nvcr.io/0921617854601259/nemo-platform-dev" +} + +variable "WHEELS_REGISTRY" { + default = "nvcr.io/0921617854601259/nemo-platform-dev" +} + +variable "BAKE_TAG" { + default = "local" +} + +variable "BASE_TAG_AUTOMODEL" { + default = "local" +} + +variable "WHEELS_TAG" { + default = "3fd6986ff173b598446ffac06d9be3f84b482495" +} + +variable "CUDA_VERSION" { + default = "12.8.1" +} + +variable "MAMBA_22_COMMIT" { + default = "6b32be06d026e170b3fdaf3ae6282c5a6ff57b06" +} + +variable "MAMBA_23_COMMIT" { + default = "v2.3.0" +} + +variable "CAUSAL_CONV1D_VERSION" { + default = "v1.5.3" +} + +# For local builds: --set "*.platform=linux/amd64" +variable "BUILD_PLATFORMS" { + default = ["linux/arm64"] +} + +function "wheel_tags" { + params = [name] + result = ["${WHEELS_REGISTRY}/${name}:${WHEELS_TAG}"] +} + +function "get_causal_conv1d_wheel_image" { + params = [] + result = "${WHEELS_REGISTRY}/causal-conv1d-wheel:${WHEELS_TAG}" +} + +function "get_mamba_ssm_wheel_image" { + params = [] + result = "${WHEELS_REGISTRY}/mamba-ssm-wheel:${WHEELS_TAG}" +} + +group "nmp-automodel-gpu-wheels" { + targets = [ + "causal-conv1d-wheel", + "mamba-ssm-wheel", + ] +} + +group "nmp-automodel" { + targets = [ + "nmp-automodel-base-builder", + "nmp-automodel-tasks-docker", + "nmp-automodel-training-docker", + "nmp-automodel-tasks-smoke-test", + "nmp-automodel-training-smoke-test", + ] +} + +# Pre-built mamba-ssm / causal-conv1d wheels (cp311, cp312, cu13.1.1). Pushed to WHEELS_REGISTRY. +target "causal-conv1d-wheel" { + target = "causal-conv1d-wheel" + context = "." 
+ dockerfile = "services/automodel/docker/Dockerfile.mamba-wheel" + tags = wheel_tags("causal-conv1d-wheel") + args = { + CUDA_VERSION = CUDA_VERSION + CAUSAL_CONV1D_VERSION = CAUSAL_CONV1D_VERSION + } + platforms = BUILD_PLATFORMS +} + +target "mamba-ssm-wheel" { + target = "mamba-ssm-wheel" + context = "." + dockerfile = "services/automodel/docker/Dockerfile.mamba-wheel" + tags = wheel_tags("mamba-ssm-wheel") + args = { + CUDA_VERSION = CUDA_VERSION + MAMBA_22_COMMIT = MAMBA_22_COMMIT + MAMBA_23_COMMIT = MAMBA_23_COMMIT + } + platforms = BUILD_PLATFORMS +} + +target "platform-workspace" { + target = "platform-workspace" + context = "." + dockerfile = "services/automodel/docker/Dockerfile.platform-workspace" +} + +target "nmp-automodel-base-builder" { + target = "nmp-automodel-base-builder" + context = "." + dockerfile = "services/automodel/docker/Dockerfile.nmp-automodel-base" + no-cache-filter = ["automodel-clone"] + tags = ["${IMAGE_REGISTRY}/nmp-automodel-base:${BAKE_TAG}"] + args = { + CAUSAL_CONV1D_WHEEL_IMAGE = get_causal_conv1d_wheel_image() + MAMBA_SSM_WHEEL_IMAGE = get_mamba_ssm_wheel_image() + } + platforms = BUILD_PLATFORMS +} + +target "nmp-automodel-tasks-docker" { + target = "runtime" + context = "." + dockerfile = "services/automodel/docker/Dockerfile.nmp-automodel-tasks" + contexts = { + platform-workspace = "target:platform-workspace" + nmp-automodel-base = "target:nmp-automodel-base-builder" + } + tags = ["${IMAGE_REGISTRY}/nmp-automodel-tasks:${BAKE_TAG}"] + args = { + BASE_REGISTRY = BASE_REGISTRY + BASE_TAG_AUTOMODEL = BASE_TAG_AUTOMODEL + } + platforms = BUILD_PLATFORMS +} + +target "nmp-automodel-training-docker" { + target = "runtime" + context = "." 
+ dockerfile = "services/automodel/docker/Dockerfile.nmp-automodel-training" + contexts = { + platform-workspace = "target:platform-workspace" + nmp-automodel-base = "target:nmp-automodel-base-builder" + } + tags = ["${IMAGE_REGISTRY}/nmp-automodel-training:${BAKE_TAG}"] + args = { + BASE_REGISTRY = BASE_REGISTRY + BASE_TAG_AUTOMODEL = BASE_TAG_AUTOMODEL + } + platforms = BUILD_PLATFORMS +} + +target "nmp-automodel-tasks-smoke-test" { + target = "smoke-test" + context = "." + dockerfile = "services/automodel/docker/Dockerfile.nmp-automodel-tasks" + contexts = { + platform-workspace = "target:platform-workspace" + nmp-automodel-base = "target:nmp-automodel-base-builder" + } + args = { + BASE_REGISTRY = BASE_REGISTRY + BASE_TAG_AUTOMODEL = BASE_TAG_AUTOMODEL + SMOKE_MARKER = "smoke_nmp_automodel_tasks" + } + output = ["type=cacheonly"] + platforms = BUILD_PLATFORMS +} + +target "nmp-automodel-training-smoke-test" { + target = "smoke-test" + context = "." + dockerfile = "services/automodel/docker/Dockerfile.nmp-automodel-training" + contexts = { + platform-workspace = "target:platform-workspace" + nmp-automodel-base = "target:nmp-automodel-base-builder" + } + args = { + BASE_REGISTRY = BASE_REGISTRY + BASE_TAG_AUTOMODEL = BASE_TAG_AUTOMODEL + SMOKE_MARKER = "smoke_nmp_automodel_training" + } + output = ["type=cacheonly"] + platforms = BUILD_PLATFORMS +} diff --git a/docs/agents/plugins.md b/docs/agents/plugins.md index 299f4727..a6765e00 100644 --- a/docs/agents/plugins.md +++ b/docs/agents/plugins.md @@ -68,7 +68,7 @@ The skills that drive the agent lifecycle are: | `agents-optimize` | Selects a deployed agent, establishes an evaluation baseline, and suggests Switchyard routing, model swaps, skill optimization, prompt tuning, and new-model evaluations. See [Optimize Agents](optimization.md). | | `agents-secure` | Selects a deployed agent, checks guardrail coverage, and scans recent telemetry for sensitive data. See [Secure Agents](security.md). 
| -Plugin-owned skills cover guardrails, evaluations, optimization, data +Plugin-owned skills cover customization, guardrails, evaluations, optimization, data designer, anonymizer, and auditor. They are installed with their plugin and appear in `nemo skills list` once the platform restarts. diff --git a/docs/set-up/config-reference.md b/docs/set-up/config-reference.md index 884f0298..23b189df 100644 --- a/docs/set-up/config-reference.md +++ b/docs/set-up/config-reference.md @@ -409,6 +409,8 @@ jobs: reconcile_interval_seconds: 2 # Interval in seconds for the job scheduler to run | default: 5 schedule_interval_seconds: 5 + # Register the subprocess/default execution profile. When unset, defaults to true for docker/none runtimes and false for kubernetes. + enable_subprocess_executor: ``` ### `models` diff --git a/packages/nemo_platform_ext/src/nemo_platform_ext/skills/nemo-fine-tune/SKILL.md b/packages/nemo_platform_ext/src/nemo_platform_ext/skills/nemo-fine-tune/SKILL.md deleted file mode 100644 index 690118a7..00000000 --- a/packages/nemo_platform_ext/src/nemo_platform_ext/skills/nemo-fine-tune/SKILL.md +++ /dev/null @@ -1,43 +0,0 @@ ---- -name: nemo-fine-tune -description: Fine-tune a model on NeMo Platform. Not yet available; this skill describes the path for when fine-tuning ships. Use for any "train a model," "fine-tune," "customize a model," or "finetune" intent so the agent tells the user the feature isn't shipped and does not go off and implement training with some other library. -triggers: - - fine-tune - - fine tune - - finetune - - train a model - - train on my data - - customize a model - - sft a model - - model customization - - model fine-tuning -not-for: - - nemo-build-agent (use for agent scaffolding and deployment, not model training) - - nemo-explore (use for agent design conversation) - - nemo-skill-selection (use to disambiguate user intent) -compatibility: NeMo Platform any version. No prerequisites today since fine-tuning is not yet shipped. 
When fine-tuning lands, this skill will document Customizer plugin requirements (host-gpu mode, training data format, supported base models). -maturity: beta -license: Apache-2.0 -user-invocable: true -allowed-tools: [Read] ---- - -# Fine-tuning on NeMo Platform - -**Fine-tuning is not yet available on NeMo Platform.** Tell the user this directly. Do not run any `nemo customization` CLI commands or scaffold a fine-tuning job; the underlying functionality is not shipped. - -When fine-tuning lands, it will be delivered through a Customizer plugin that wraps NVIDIA's training stack (AutoModel, Megatron-Bridge, and related). This skill will be filled in at that point. - -## What to tell the user today - -- Fine-tuning is on the NeMo Platform roadmap and is not currently functional. Any CLI surface that looks like it should work (`nemo customization jobs ...`) is not connected to a working training backend. -- Other NeMo Platform capabilities they can use today: harden an agent (`nemo-skill-selection` → guardrails / auditor / anonymizer), evaluate an agent (`nemo-skill-selection` → evaluator), tune an agent's prompts and routing (`nemo-skill-selection` → optimization). -- If they need fine-tuning urgently, point them at upstream NVIDIA training tools (NeMo Framework, NeMo-RL, Megatron-LM) and tell them this skill will be wired up once the Customizer plugin lands. - -## Verification - -There is nothing to verify. Do not claim a fine-tuning task succeeded. If the user asks the agent to run fine-tuning anyway, refuse and explain why. - -## When fine-tuning ships - -This skill will gain pre-flight checks, a training-data preparation walkthrough, job submission, progress monitoring, and result download. Track the Customizer plugin in the NeMo Platform roadmap; this skill updates when that ships. 
diff --git a/packages/nemo_platform_ext/src/nemo_platform_ext/skills/nemo-fine-tune/tests.json b/packages/nemo_platform_ext/src/nemo_platform_ext/skills/nemo-fine-tune/tests.json deleted file mode 100644 index 385b0bff..00000000 --- a/packages/nemo_platform_ext/src/nemo_platform_ext/skills/nemo-fine-tune/tests.json +++ /dev/null @@ -1,65 +0,0 @@ -{ - "skill": "nemo-fine-tune", - "tests": [ - { - "type": "explicit", - "prompt": "Use nemo-fine-tune to start a job on my training data.", - "expected_skill": "nemo-fine-tune" - }, - { - "type": "explicit", - "prompt": "Run the fine-tune skill against my dataset at data/train.jsonl.", - "expected_skill": "nemo-fine-tune" - }, - { - "type": "explicit", - "prompt": "Open the nemo fine-tune skill and walk me through it.", - "expected_skill": "nemo-fine-tune" - }, - { - "type": "implicit", - "prompt": "I want to fine-tune a model on my own data.", - "expected_skill": "nemo-fine-tune" - }, - { - "type": "implicit", - "prompt": "Can NeMo Platform train a model for me?", - "expected_skill": "nemo-fine-tune" - }, - { - "type": "implicit", - "prompt": "I need to customize a model with SFT.", - "expected_skill": "nemo-fine-tune" - }, - { - "type": "contextual", - "prompt": "I want to optimize my agent's prompt for better accuracy.", - "expected_skill_not": "nemo-fine-tune" - }, - { - "type": "contextual", - "prompt": "Build me an agent that uses a smaller model for cheap tasks.", - "expected_skill_not": "nemo-fine-tune" - }, - { - "type": "contextual", - "prompt": "Evaluate my agent against a benchmark dataset.", - "expected_skill_not": "nemo-fine-tune" - }, - { - "type": "negative-control", - "prompt": "What's the weather in San Francisco today?", - "expected_skill_not": "nemo-fine-tune" - }, - { - "type": "negative-control", - "prompt": "Help me set up a new Postgres database on this machine.", - "expected_skill_not": "nemo-fine-tune" - }, - { - "type": "negative-control", - "prompt": "Show me the latest news about NVIDIA 
stock.", - "expected_skill_not": "nemo-fine-tune" - } - ] -} diff --git a/packages/nemo_platform_ext/src/nemo_platform_ext/skills/nemo-skill-selection/SKILL.md b/packages/nemo_platform_ext/src/nemo_platform_ext/skills/nemo-skill-selection/SKILL.md index 913e81ba..adc621b9 100644 --- a/packages/nemo_platform_ext/src/nemo_platform_ext/skills/nemo-skill-selection/SKILL.md +++ b/packages/nemo_platform_ext/src/nemo_platform_ext/skills/nemo-skill-selection/SKILL.md @@ -1,6 +1,6 @@ --- name: nemo-skill-selection -description: Top-level skill selector for any task involving NeMo Platform (NVIDIA's agent platform). Picks the right downstream skill (setup, explore, spec, build, try, status, teardown, fine-tune) from natural-language intent. Use over generic brainstorming, planning, or onboarding skills for any NeMo Platform task. +description: Top-level skill selector for any task involving NeMo Platform (NVIDIA's agent platform). Picks the right downstream skill (setup, explore, spec, build, try, status, teardown, customization training) from natural-language intent. Use over generic brainstorming, planning, or onboarding skills for any NeMo Platform task. triggers: - build an agent - create an agent @@ -48,7 +48,7 @@ Match the user's intent to one downstream skill. Pick exactly one. | "ask my agent", "try the agent", "test it" | `nemo-try-agent` | Send a query to a deployed agent or fall back to model chat | | "status", "what is running", "platform health", "is the platform up", "what's deployed", "show me what's running" | `nemo-status` | Read-only dashboard: platform, agents, providers, models | | "shut down", "stop NeMo", "tear down", "clean up" | `nemo-teardown` | Stop the cluster (keep data, delete platform data, or full cleanup) | -| "fine-tune", "customize the model", "train on my data" | `nemo-fine-tune` | Fine-tuning is not yet available on NeMo Platform. 
Pick this so the agent tells the user it's not shipped instead of going off to implement training with some other library. | +| "fine-tune", "customize the model", "train on my data", "SFT", "LoRA" | `nemo-customizer` | Model customization via installed customization contributor plugins (`nemo-customizer-plugin`). Requires plugin skills to be installed (`nemo skills install` / enabled-plugins). | | "optimize my agent", "make it cheaper", "reduce latency", "smaller model", "switchyard", "routing split", "compare against a newer model" | `agents-optimize` (plugin-owned, in `plugins/nemo-agents`) | Cost / latency / quality optimization for a **deployed** agent. Routing splits, skill tuning, prompt tuning, new-model scans. | | "secure my agent", "harden my agent", "check for PII", "leaked secrets", "guardrail coverage" | `agents-secure` (plugin-owned, in `plugins/nemo-agents`) | Safety and security audit for a **deployed** agent. Guardrails, PII, secrets scan. | | "evaluate my agent", "run a benchmark", "eval suite" | `nemo-evaluator` (plugin-owned, in `plugins/nemo-evaluator`) | Evaluation metrics, LLM-judge, benchmark jobs against a deployed agent or model. 
| @@ -104,12 +104,12 @@ NeMo Platform skills I can route to: nemo-try-agent query a deployed agent or chat with a model nemo-status read-only platform health dashboard nemo-teardown guided shutdown - nemo-fine-tune fine-tuning (not yet shipped; reports that honestly) Plugin-owned skills: agents-optimize cost / latency / quality optimization for a deployed agent agents-secure safety and security audit for a deployed agent nemo-evaluator evaluation metrics, LLM-judge, benchmark jobs + nemo-customizer fine-tuning of models guardrails content-safety middleware via virtual models auditor red-team vulnerability scanning (garak) data-designer synthetic dataset generation @@ -142,5 +142,5 @@ Do not proactively suggest Studio as the path for anything a skill already cover - **One skill at a time.** Do not load more than one downstream skill in the same turn. Each downstream skill is a full procedure with its own context budget. - **Install must happen before any skill can do useful work.** Build, try, and status all assume the platform is up. If the user has not run the CLI install (`make bootstrap` + `nemo setup`), the skills cannot work around that; hand them to `setup` for instructions. - **NeMo Platform is the product name.** Capital N, e, M, o, P. Not "nemo" or "Nemo." NAT on first mention is "NVIDIA NeMo Agent Toolkit (NAT)." -- **Fine-tuning is not yet available.** When the user asks to fine-tune, train, or customize a model, pick `nemo-fine-tune` so the agent tells the user it's not shipped instead of trying to wire up training with some other library. Do not run `nemo customization` CLI commands; the backend is not connected. +- **Model customization** goes to the `nemo-customizer` plugin skill when `nemo-customizer-plugin` (and a training backend) are installed. If that skill is not available, tell the user to enable customization plugins and install skills — do not improvise training with an external library. 
- **Framework honesty.** If the user describes an agent in CrewAI, AutoGen, plain LangChain, or Pydantic AI, tell them up front that NeMo Platform's optimization and evaluation surfaces operate on NAT-wrapped LangGraph agents. They will need to wrap their agent before the build path produces value. diff --git a/packages/nemo_platform_ext/tests/cli/test_app.py b/packages/nemo_platform_ext/tests/cli/test_app.py index d7e4ca47..f909ec91 100644 --- a/packages/nemo_platform_ext/tests/cli/test_app.py +++ b/packages/nemo_platform_ext/tests/cli/test_app.py @@ -51,7 +51,8 @@ def test_help_includes_getting_started(): assert "Getting started:" in result.stdout assert "nemo docs --list" in result.stdout assert "nemo services run --help" in result.stdout - assert "Set up NeMo Platform: start services, configure a provider, install skills." in result.stdout + # Help panel truncates long command descriptions; match the visible prefix. + assert "Set up NeMo Platform: start services" in result.stdout assert "--help, -h" in result.stdout assert "nemo auth login --base-url" not in result.stdout assert "nemo quickstart configure" not in result.stdout diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/authz.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/authz.py new file mode 100644 index 00000000..7cb264d7 --- /dev/null +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/authz.py @@ -0,0 +1,141 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Authorization policy contributions for NeMo Platform plugins. + +Plugins declare API routes and permissions so the auth service can authorize +requests without hand-editing ``static-authz.yaml`` for every new surface. + +Contributions are merged at runtime when the OPA bundle is built, and can be +materialized into ``static-authz.yaml`` via ``auth-tools sync-plugins``. 
+ +Example (customization job collection):: + + from nemo_platform_plugin.authz import AuthzContribution, authz_for_workspace_job_collection + + class AutomodelContributor: + ... + def get_authz_contribution(self) -> AuthzContribution: + return authz_for_workspace_job_collection( + api_area="customization", + collection_suffix="/automodel/jobs", + permission_prefix="customization.automodel.jobs", + include_healthz=True, + healthz_suffix="/automodel/healthz", + ) +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + + +@dataclass(frozen=True) +class AuthzEndpointMethod: + """One HTTP method binding for an API route.""" + + permissions: list[str] + scopes: list[str] | None = None + + +@dataclass +class AuthzContribution: + """Authorization data contributed by a plugin.""" + + permissions: dict[str, str] = field(default_factory=dict) + """Flat registry entries: ``permission_id`` → human-readable description.""" + + endpoints: dict[str, dict[str, AuthzEndpointMethod]] = field(default_factory=dict) + """Full API paths (``/apis/...``) → lower-case HTTP method → spec.""" + + role_permissions: dict[str, list[str]] = field(default_factory=dict) + """Optional explicit role → permission grants (merged with defaults).""" + + def to_dict(self) -> dict[str, Any]: + """Serialize for :func:`nmp.common.auth.authz_merge.merge_authz_contributions`.""" + return { + "permissions": dict(self.permissions), + "endpoints": { + path: { + method: { + "permissions": spec.permissions, + **({"scopes": spec.scopes} if spec.scopes is not None else {}), + } + for method, spec in methods.items() + } + for path, methods in self.endpoints.items() + }, + "role_permissions": {role: list(perms) for role, perms in self.role_permissions.items()}, + } + + +def _scopes_for(api_area: str, write: bool) -> list[str]: + verb = "write" if write else "read" + return [f"{api_area}:{verb}", f"platform:{verb}"] + + +def 
_job_collection_permissions(permission_prefix: str) -> dict[str, str]: + return { + f"{permission_prefix}.create": f"Create {permission_prefix} jobs", + f"{permission_prefix}.list": f"List {permission_prefix} jobs", + f"{permission_prefix}.read": f"Read {permission_prefix} jobs", + f"{permission_prefix}.delete": f"Delete {permission_prefix} jobs", + } + + +def authz_for_workspace_job_collection( + api_area: str, + collection_suffix: str, + permission_prefix: str, + include_healthz: bool = False, + healthz_suffix: str | None = None, +) -> AuthzContribution: + """Build authz for standard CORE job routes under ``/apis/{api_area}/v2/workspaces/{workspace}...``. + + Args: + api_area: URL segment after ``/apis/`` (e.g. ``customization``, ``safe-synthesizer``). + collection_suffix: Path after workspace (e.g. ``/automodel/jobs`` or ``/jobs``). + permission_prefix: Dot-separated permission namespace (e.g. ``customization.automodel.jobs``). + include_healthz: When true, register GET healthz with empty permissions (authenticated only). + healthz_suffix: Defaults to ``{first segment of collection_suffix}/healthz`` when omitted. 
+ """ + if not collection_suffix.startswith("/"): + raise ValueError("collection_suffix must start with '/'") + base = f"/apis/{api_area}/v2/workspaces/{{workspace}}{collection_suffix}" + perms = _job_collection_permissions(permission_prefix) + prefix = permission_prefix + endpoints: dict[str, dict[str, AuthzEndpointMethod]] = { + base: { + "post": AuthzEndpointMethod( + permissions=[f"{prefix}.create"], + scopes=_scopes_for(api_area, write=True), + ), + "get": AuthzEndpointMethod( + permissions=[f"{prefix}.list"], + scopes=_scopes_for(api_area, write=False), + ), + }, + f"{base}/{{name}}": { + "get": AuthzEndpointMethod( + permissions=[f"{prefix}.read"], + scopes=_scopes_for(api_area, write=False), + ), + "delete": AuthzEndpointMethod( + permissions=[f"{prefix}.delete"], + scopes=_scopes_for(api_area, write=True), + ), + }, + } + if include_healthz: + if healthz_suffix is None: + first = collection_suffix.strip("/").split("/")[0] + healthz_suffix = f"/{first}/healthz" + if not healthz_suffix.startswith("/"): + healthz_suffix = f"/{healthz_suffix}" + health_path = f"/apis/{api_area}/v2/workspaces/{{workspace}}{healthz_suffix}" + endpoints[health_path] = { + "get": AuthzEndpointMethod(permissions=[], scopes=[]), + } + + return AuthzContribution(permissions=perms, endpoints=endpoints) diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/authz_discovery.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/authz_discovery.py new file mode 100644 index 00000000..54837f86 --- /dev/null +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/authz_discovery.py @@ -0,0 +1,169 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Discover plugin authorization contributions for policy merge.""" + +from __future__ import annotations + +import inspect +import logging +from functools import cache +from typing import Any, Callable + +from nemo_platform_plugin.authz import AuthzContribution + +logger = logging.getLogger(__name__) + +AUTHZ_GROUP = "nemo.authz" + +AuthzContributor = Callable[[], AuthzContribution] | type[Any] + + +def _load_authz_contribution(loaded: AuthzContributor, source: str) -> AuthzContribution | None: + try: + if isinstance(loaded, type): + target = loaded + if hasattr(target, "get_authz_contribution"): + result = target.get_authz_contribution() + else: + instance = target() + result = instance.get_authz_contribution() if hasattr(instance, "get_authz_contribution") else None + elif callable(loaded): + result = loaded() + else: + logger.warning("Authz entry %s is not callable or a class — skipping", source) + return None + except Exception: + logger.warning("Failed to load authz contribution from %s — skipping", source, exc_info=True) + return None + + if result is None: + return None + if isinstance(result, AuthzContribution): + return result + if isinstance(result, dict): + return AuthzContribution( + permissions=result.get("permissions") or {}, + endpoints={ + path: {method: _method_from_dict(spec) for method, spec in methods.items() if isinstance(spec, dict)} + for path, methods in (result.get("endpoints") or {}).items() + if isinstance(methods, dict) + }, + role_permissions=result.get("role_permissions") or {}, + ) + logger.warning("Authz contribution from %s has unexpected type %r — skipping", source, type(result)) + return None + + +def _invoke_get_authz_contribution(item: Any) -> Any: + """Call ``get_authz_contribution`` on a service class or contributor instance.""" + getter = getattr(item, "get_authz_contribution", None) + if not callable(getter): + return None + if isinstance(item, type): + # discover_services() yields 
classes — must be @classmethod on NemoService. + return getter() + return getter() + + +def _method_from_dict(spec: dict[str, Any]) -> Any: + from nemo_platform_plugin.authz import AuthzEndpointMethod + + return AuthzEndpointMethod( + permissions=list(spec.get("permissions") or []), + scopes=list(spec["scopes"]) if spec.get("scopes") is not None else None, + ) + + +def _collect_from_plugin_surface( + items: dict[str, Any], + surface: str, +) -> list[AuthzContribution]: + contributions: list[AuthzContribution] = [] + for key, item in items.items(): + if not hasattr(item, "get_authz_contribution"): + continue + if isinstance(item, type): + method = inspect.getattr_static(item, "get_authz_contribution", None) + if method is None or not isinstance(method, classmethod): + # Only classmethods are valid on NemoService subclasses (no instance). + continue + try: + result = _invoke_get_authz_contribution(item) + except TypeError as exc: + logger.warning( + "Authz on %s %r must be a @classmethod (discover_services loads classes): %s", + surface, + key, + exc, + ) + continue + except Exception: + logger.warning( + "Failed to get authz contribution from %s %r — skipping", + surface, + key, + exc_info=True, + ) + continue + if result is None: + continue + if isinstance(result, AuthzContribution): + contributions.append(result) + elif isinstance(result, dict): + loaded = _load_authz_contribution(lambda: result, source=f"{surface}:{key}") + if loaded is not None: + contributions.append(loaded) + else: + logger.warning( + "Authz contribution from %s %r has unexpected type %r — skipping", + surface, + key, + type(result), + ) + return contributions + + +@cache +def discover_authz_contributions() -> list[AuthzContribution]: + """Collect authz contributions from entry points and plugin surfaces. + + Sources (in order): + + 1. ``nemo.authz`` entry points (callable or class) + 2. ``nemo.services`` classes implementing :meth:`get_authz_contribution` + 3. 
``nemo.customization.contributors`` instances implementing the same + """ + from nemo_platform_plugin.discovery import ( + CUSTOMIZATION_CONTRIBUTORS_GROUP, + discover_customization_contributors, + discover_entry_points, + discover_services, + ) + + contributions: list[AuthzContribution] = [] + + for ep_name, ep in discover_entry_points(AUTHZ_GROUP).items(): + try: + loaded = ep.load() + contrib = _load_authz_contribution(loaded, source=f"nemo.authz:{ep_name}") + if contrib is not None: + contributions.append(contrib) + logger.debug("Loaded authz contribution from nemo.authz:%s", ep_name) + except Exception: + logger.warning("Failed to load nemo.authz entry %r — skipping", ep_name, exc_info=True) + + contributions.extend(_collect_from_plugin_surface(discover_services(), surface="nemo.services")) + contributions.extend( + _collect_from_plugin_surface( + discover_customization_contributors(), + surface=CUSTOMIZATION_CONTRIBUTORS_GROUP, + ) + ) + + return contributions + + +def discover_authz_contribution_dicts() -> list[dict[str, Any]]: + """Return contributions as dicts for :func:`nmp.common.auth.authz_merge.merge_authz_contributions`.""" + return [c.to_dict() for c in discover_authz_contributions()] diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/commands.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/commands.py index 98cf4051..93b21e88 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/commands.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/commands.py @@ -584,6 +584,7 @@ def _do_submit() -> Any: workspace=workspace, profile=profile, options=merged_options or None, + headers=_resolve_submit_auth_headers(typer_ctx) or None, ) renderer: CLIRenderer | None = None @@ -1088,6 +1089,22 @@ def _resolve_local_cli_sdks( return sdk, async_sdk +def _resolve_submit_auth_headers(typer_ctx: typer.Context) -> dict[str, str]: + """Bearer (and other) default headers from the active CLI context.""" + state = 
typer_ctx.obj + if state is None or not hasattr(state, "get_sdk_context"): + return {} + try: + ctx = state.get_sdk_context() + client_config = ctx.user.get_client_config() + headers = client_config.get("default_headers") + if isinstance(headers, dict): + return {str(k): str(v) for k, v in headers.items()} + except Exception: + return {} + return {} + + # ---- submit ------------------------------------------------------ # @@ -1130,7 +1147,7 @@ def _submit(typer_ctx: typer.Context, **kwargs: object) -> None: cluster=cluster, workspace=workspace, ) - headers: dict[str, str] = {} + headers = _resolve_submit_auth_headers(typer_ctx) if request_id is not None: headers["X-Request-ID"] = request_id diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/customization_contributor.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/customization_contributor.py new file mode 100644 index 00000000..71092596 --- /dev/null +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/customization_contributor.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
@runtime_checkable
class CustomizationContributor(Protocol):
    """One training backend mounted under ``/apis/customization``.

    The protocol is ``runtime_checkable``, so ``isinstance`` only verifies
    that the members below are present on the object; it does not check
    signatures or return types.
    """

    # Identifier of the backend (class-level).
    name: ClassVar[str]
    # Names this contributor depends on (class-level).
    dependencies: ClassVar[list[str]]

    def get_routers(self) -> list[RouterSpec]:
        """HTTP routes for this backend (workspace-scoped prefix per backend)."""
        ...

    def get_cli(self) -> typer.Typer | None:
        """CLI subgroup mounted at ``nemo customization <name>``, or ``None`` for no CLI."""
        ...

    def get_authz_contribution(self) -> AuthzContribution | None:
        """Optional authorization policy (endpoints + permissions) for this contributor.

        Implement to return :class:`~nemo_platform_plugin.authz.AuthzContribution`, or
        register a ``nemo.authz`` entry point instead.
        """
        ...
diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/discovery.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/discovery.py index 60eb440b..83d8a965 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/discovery.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/discovery.py @@ -22,7 +22,9 @@ ``nemo.docs`` → :func:`discover_docs` — ``() -> Path | dict`` callable ``nemo.executors`` → :func:`discover_executors` — ``Executor`` class ``nemo.inference_middleware`` → :func:`discover_inference_middleware` — :class:`~nemo_platform_plugin.inference_middleware.NemoInferenceMiddleware` subclass (typed, IGW instantiates) +``nemo.customization.contributors`` → :func:`discover_customization_contributors` — :class:`~nemo_platform_plugin.customization_contributor.CustomizationContributor` instance (typed, customization router instantiates) ``nemo.seed`` → :func:`discover_seed_jobs` — :class:`~nemo_platform_plugin.seed.NemoSeedJob` subclass (typed, platform instantiates) +``nemo.authz`` → :func:`~nemo_platform_plugin.authz_discovery.discover_authz_contributions` — policy endpoints/permissions (merged at runtime and via ``auth-tools sync-plugins``) Wrappers for surfaces whose types are not yet defined in this package return ``dict[str, Any]`` — callers cast as needed. 
def _instantiate_customization_contributor(loaded: object) -> CustomizationContributor:
    """Turn a loaded entry-point object into a contributor instance.

    A class is called with no arguments; anything else is used as-is. Raises
    ``TypeError`` when the resulting object does not satisfy the
    ``CustomizationContributor`` protocol.
    """
    from nemo_platform_plugin.customization_contributor import CustomizationContributor

    candidate = loaded() if isinstance(loaded, type) else loaded
    if isinstance(candidate, CustomizationContributor):
        return candidate
    raise TypeError(
        f"Expected CustomizationContributor instance, got {type(candidate)!r}",
    )


@cache
def discover_customization_contributors() -> dict[str, CustomizationContributor]:
    """Typed wrapper: discover ``nemo.customization.contributors`` entry-points.

    The result maps each entry-point name to a contributor instance. An entry
    point may expose either a class (instantiated here) or a ready-made
    instance. Any exception while loading or instantiating one entry is logged
    as a warning and that entry is left out; the rest are still returned.
    The entry-point name is always the dict key, even when the contributor
    class declares a different ``name`` (that mismatch is only logged).
    """
    contributors: dict[str, CustomizationContributor] = {}

    for entry_point in discover_entry_points(CUSTOMIZATION_CONTRIBUTORS_GROUP).values():
        try:
            contributor = _instantiate_customization_contributor(entry_point.load())
            declared_name = getattr(type(contributor), "name", None) or entry_point.name
            if declared_name != entry_point.name:
                logger.warning(
                    "Contributor entry-point key %r differs from class name %r; using entry-point key",
                    entry_point.name,
                    declared_name,
                )
            contributors[entry_point.name] = contributor
            logger.debug(
                "Loaded customization contributor %r from %s",
                entry_point.name,
                entry_point.value,
            )
        except Exception:
            logger.warning(
                "Failed to load customization contributor %r (%s) — skipping",
                entry_point.name,
                entry_point.value,
                exc_info=True,
            )

    return contributors


def discover_customization_contributor_classes() -> dict[str, type]:
    """Return contributor entry-point name → loaded class (for tests).

    Entries whose loaded object is not a class are omitted.
    """
    return {
        key: loaded
        for key, loaded in discover(CUSTOMIZATION_CONTRIBUTORS_GROUP).items()
        if isinstance(loaded, type)
    }
diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/scheduler.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/scheduler.py index 9d505617..3c5828f4 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/scheduler.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/scheduler.py @@ -151,6 +151,7 @@ def submit_remote( options: dict | None = None, metadata: dict | None = None, http_client: httpx.Client | None = None, + headers: dict[str, str] | None = None, timeout: float = 30.0, ) -> dict: """POST the job to the plugin service's per-job submit route. @@ -173,6 +174,8 @@ def submit_remote( http_client: Optional injected :class:`httpx.Client`. Defaults to a short-lived client per call; tests supply a mock transport. + headers: Optional per-request headers (e.g. ``Authorization`` from + the CLI). Merged on each POST; not inferred from *http_client*. timeout: Request timeout in seconds. Returns: @@ -184,7 +187,7 @@ def submit_remote( """ url = self._build_submit_url(job_cls, base_url=base_url, workspace=workspace) body = self._build_submit_body(spec, profile=profile, options=options, metadata=metadata) - return self._post_submit(url, body, http_client=http_client, timeout=timeout) + return self._post_submit(url, body, http_client=http_client, headers=headers, timeout=timeout) # ------------------------------------------------------------------ # # Schema discovery # @@ -373,6 +376,7 @@ def _post_submit( body: dict[str, Any], *, http_client: httpx.Client | None, + headers: dict[str, str] | None, timeout: float, ) -> dict: """POST *body* to *url* and return the decoded JSON response. @@ -380,12 +384,13 @@ def _post_submit( Uses *http_client* when provided; otherwise opens a short-lived client per call. 
""" + request_headers = dict(headers) if headers else None logger.debug("submit_remote POST %s", url) if http_client is not None: - response = http_client.post(url, json=body, timeout=timeout) + response = http_client.post(url, json=body, headers=request_headers, timeout=timeout) else: with httpx.Client(timeout=timeout) as client: - response = client.post(url, json=body) + response = client.post(url, json=body, headers=request_headers) response.raise_for_status() return response.json() diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/service.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/service.py index 84430fe4..ea253fdf 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/service.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/service.py @@ -115,6 +115,17 @@ async def on_shutdown(self) -> None: The default implementation does nothing. """ + @classmethod + def get_authz_contribution(cls) -> object | None: + """Optional authorization policy for routes under ``/apis//``. + + Override as a **classmethod** on the :class:`NemoService` subclass (``discover_services`` + loads classes, not instances). Return + :class:`~nemo_platform_plugin.authz.AuthzContribution` or register a ``nemo.authz`` + entry point. Default: no plugin-specific authz. + """ + return None + def get_exception_handlers(self) -> dict[type[Exception], ExceptionHandler]: """Return a mapping of exception types to handler functions. diff --git a/packages/nemo_platform_plugin/tests/test_authz.py b/packages/nemo_platform_plugin/tests/test_authz.py new file mode 100644 index 00000000..c7cfc74e --- /dev/null +++ b/packages/nemo_platform_plugin/tests/test_authz.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest +from nemo_platform_plugin.authz import AuthzContribution, authz_for_workspace_job_collection +from nemo_platform_plugin.authz_discovery import _collect_from_plugin_surface, discover_authz_contributions +from nemo_platform_plugin.service import NemoService + + +def test_authz_for_workspace_job_collection_paths() -> None: + contrib = authz_for_workspace_job_collection( + api_area="customization", + collection_suffix="/automodel/jobs", + permission_prefix="customization.automodel.jobs", + include_healthz=True, + healthz_suffix="/automodel/healthz", + ) + assert "/apis/customization/v2/workspaces/{workspace}/automodel/jobs" in contrib.endpoints + post = contrib.endpoints["/apis/customization/v2/workspaces/{workspace}/automodel/jobs"]["post"] + assert post.permissions == ["customization.automodel.jobs.create"] + assert "customization:write" in (post.scopes or []) + assert "customization.automodel.jobs.create" in contrib.permissions + + +def test_service_class_get_authz_contribution_without_instance() -> None: + """discover_services yields classes; get_authz_contribution must be a classmethod.""" + + class _Svc(NemoService): + name = "example-svc" + dependencies = [] + + @classmethod + def get_authz_contribution(cls) -> AuthzContribution: + return authz_for_workspace_job_collection( + api_area="example-svc", + collection_suffix="/jobs", + permission_prefix="example-svc.jobs", + ) + + def get_routers(self): + return [] + + contribs = _collect_from_plugin_surface({"example-svc": _Svc}, surface="nemo.services") + assert len(contribs) == 1 + assert "/apis/example-svc/v2/workspaces/{workspace}/jobs" in contribs[0].endpoints + + +def test_discover_includes_automodel_when_installed(monkeypatch: pytest.MonkeyPatch) -> None: + """When nemo-automodel-plugin is installed, its contributor authz is discovered.""" + try: + from nemo_automodel_plugin.contributor import AutomodelContributor + 
except ImportError: + pytest.skip("nemo-automodel-plugin not installed") + + monkeypatch.setattr( + "nemo_platform_plugin.discovery.discover_customization_contributors", + lambda: {"automodel": AutomodelContributor()}, + ) + monkeypatch.setattr( + "nemo_platform_plugin.discovery.discover_services", + lambda: {}, + ) + monkeypatch.setattr( + "nemo_platform_plugin.discovery.discover_entry_points", + lambda _group: {}, + ) + discover_authz_contributions.cache_clear() + try: + contributions = discover_authz_contributions() + finally: + discover_authz_contributions.cache_clear() + + assert len(contributions) >= 1 + paths = set() + for contrib in contributions: + paths.update(contrib.endpoints.keys()) + assert "/apis/customization/v2/workspaces/{workspace}/automodel/jobs" in paths diff --git a/packages/nemo_platform_plugin/tests/test_commands.py b/packages/nemo_platform_plugin/tests/test_commands.py index 58453bb9..be4a4025 100644 --- a/packages/nemo_platform_plugin/tests/test_commands.py +++ b/packages/nemo_platform_plugin/tests/test_commands.py @@ -319,6 +319,35 @@ def test_submit_accepts_profile_and_cluster_flags(self) -> None: assert "--profile" in output assert "--cluster" in output + def test_submit_passes_cli_auth_headers(self, monkeypatch) -> None: + captured: dict[str, object] = {} + + def _capture(_self, _job_cls, _spec, headers=None, **_kwargs) -> dict: + captured["headers"] = headers + return {"id": "job-123"} + + class _State: + def get_sdk_context(self) -> SimpleNamespace: + return SimpleNamespace( + user=SimpleNamespace( + get_client_config=lambda: { + "default_headers": {"Authorization": "Bearer test-token"}, + } + ) + ) + + monkeypatch.setattr("nemo_platform_plugin.scheduler.NemoJobScheduler.submit_remote", _capture) + + app = _app_with_jobs(_GreetJob) + result = runner.invoke( + app, + ["greet", "submit", "--base-url", "http://127.0.0.1:8080"], + obj=_State(), + ) + + assert result.exit_code == 0, result.output + assert captured["headers"] == 
{"Authorization": "Bearer test-token"} + # --------------------------------------------------------------------------- # explain verb — phase 1 MR 1.2c stubs diff --git a/packages/nemo_platform_plugin/tests/test_discovery.py b/packages/nemo_platform_plugin/tests/test_discovery.py index 3eda7da6..ac629d00 100644 --- a/packages/nemo_platform_plugin/tests/test_discovery.py +++ b/packages/nemo_platform_plugin/tests/test_discovery.py @@ -13,8 +13,10 @@ from nemo_platform_plugin.cli import NemoCLI from nemo_platform_plugin.discovery import ( _ALL_SURFACE_GROUPS, + CUSTOMIZATION_CONTRIBUTORS_GROUP, discover, discover_cli, + discover_customization_contributors, discover_entry_points, discover_functions, discover_jobs, @@ -40,10 +42,12 @@ def clear_discovery_cache(): discover_entry_points.cache_clear() discover.cache_clear() discover_manifests.cache_clear() + discover_customization_contributors.cache_clear() yield discover_entry_points.cache_clear() discover.cache_clear() discover_manifests.cache_clear() + discover_customization_contributors.cache_clear() # --------------------------------------------------------------------------- @@ -563,3 +567,55 @@ def test_function_only_plugins_use_plugin_name_not_function_name(self) -> None: result = discover_manifests() assert list(result.keys()) == ["example"] assert result["example"].version == "1.2.3" + + +class TestDiscoverCustomizationContributors: + def test_group_in_all_surface_groups(self) -> None: + assert CUSTOMIZATION_CONTRIBUTORS_GROUP in _ALL_SURFACE_GROUPS + + def test_uses_customization_contributors_group(self) -> None: + with patch("nemo_platform_plugin.discovery.entry_points", return_value=[]) as mock_eps: + discover_customization_contributors() + mock_eps.assert_called_once_with(group=CUSTOMIZATION_CONTRIBUTORS_GROUP) + + def test_instantiates_contributor_class(self) -> None: + class _Contributor: + name = "fake" + dependencies = ["jobs"] + + def get_routers(self) -> list[RouterSpec]: + return [] + + def 
get_cli(self) -> None: + return None + + def get_authz_contribution(self): + return None + + ep = _make_ep("fake", _Contributor) + with patch("nemo_platform_plugin.discovery.entry_points", return_value=[ep]): + result = discover_customization_contributors() + assert isinstance(result["fake"], _Contributor) + + def test_failing_contributor_is_skipped(self) -> None: + bad = _make_ep("bad", None) + bad.load.side_effect = RuntimeError("broken") + + class _Contributor: + name = "good" + dependencies = ["jobs"] + + def get_routers(self) -> list[RouterSpec]: + return [] + + def get_cli(self) -> None: + return None + + def get_authz_contribution(self): + return None + + good = _make_ep("good", _Contributor) + with patch("nemo_platform_plugin.discovery.entry_points", return_value=[bad, good]): + result = discover_customization_contributors() + assert "bad" not in result + assert "good" in result diff --git a/packages/nmp_common/src/nmp/common/auth/authz_merge.py b/packages/nmp_common/src/nmp/common/auth/authz_merge.py new file mode 100644 index 00000000..f45184c1 --- /dev/null +++ b/packages/nmp_common/src/nmp/common/auth/authz_merge.py @@ -0,0 +1,127 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
def _merge_registry_in_place(target: dict[str, Any], overlay: dict[str, Any]) -> None:
    """Merge *overlay* into *target*, mutating *target* (which the caller must own)."""
    for key, value in overlay.items():
        current = target.get(key)
        # Recurse only when both sides are branches. An overlay node carrying
        # ``description`` is a leaf and replaces whatever was there.
        if isinstance(current, dict) and isinstance(value, dict) and "description" not in value:
            _merge_registry_in_place(current, value)
        else:
            target[key] = copy.deepcopy(value)


def _deep_merge_permission_registry(base: dict[str, Any], overlay: dict[str, Any]) -> dict[str, Any]:
    """Merge nested permission registry trees (leaf nodes have ``description``).

    Returns a new tree; neither *base* nor *overlay* is modified. For a key
    present on both sides, two dicts are merged recursively unless the overlay
    value has a ``description`` key, in which case the overlay value replaces
    the existing one. Any other combination is replaced by the overlay value.
    """
    merged = copy.deepcopy(base)  # copied once; the recursion works in place on the copy
    _merge_registry_in_place(merged, overlay)
    return merged


def _permission_id_to_nested(permission_id: str, description: str) -> dict[str, Any]:
    """Turn ``customization.automodel.jobs.create`` into nested registry dict."""
    *branches, leaf = permission_id.split(".")
    root: dict[str, Any] = {}
    cursor = root
    for part in branches:
        cursor[part] = {}
        cursor = cursor[part]
    cursor[leaf] = {"description": description}
    return root


def _merge_flat_permissions(
    registry: dict[str, Any],
    flat_permissions: dict[str, str],
) -> dict[str, Any]:
    """Return a copy of *registry* with each ``permission_id -> description`` merged in.

    Permissions are applied in iteration order, so a later id wins where two
    ids collide on the same node. *registry* itself is not modified.
    """
    merged = copy.deepcopy(registry)  # one copy up front instead of one per permission
    for perm_id, description in flat_permissions.items():
        _merge_registry_in_place(merged, _permission_id_to_nested(perm_id, description))
    return merged


def _default_roles_for_permission(permission_id: str) -> list[str]:
    """Mirror ``auth-tools update`` role assignment heuristics."""
    suffix = permission_id.rsplit(".", 1)[-1]
    if suffix in {"list", "read"}:
        return ["Viewer", "Editor"]
    return ["Editor"]


def merge_authz_contributions(
    base_data: dict[str, Any],
    contributions: list[dict[str, Any]],
) -> dict[str, Any]:
    """Merge plugin :class:`AuthzContribution` payloads into loaded static authz data.

    Each contribution dict may contain:

    - ``permissions``: flat ``permission_id -> description`` for the registry
    - ``endpoints``: ``path -> {method: {permissions, scopes?}}``
    - ``role_permissions``: optional ``role -> [permission_id, ...]`` extra grants
      (defaults: ``.list``/``.read`` → Viewer+Editor, else Editor only)

    Later contributions override endpoint methods for the same path+method.
    Sections of the wrong type are ignored; in particular a non-dict
    ``permissions`` value yields neither registry entries nor default role
    grants.

    Args:
        base_data: Static authz data; the merge works under its ``"authz"`` key.
        contributions: Contribution payloads, applied in order.

    Returns:
        *base_data* itself (not a copy) when *contributions* is empty;
        otherwise a deep copy with the contributions merged in, leaving
        *base_data* untouched.
    """
    if not contributions:
        return base_data

    merged = copy.deepcopy(base_data)
    authz = merged.setdefault("authz", {})
    registry = authz.setdefault("permissions", {})
    endpoints = authz.setdefault("endpoints", {})
    roles = authz.setdefault("roles", {})

    role_grants: dict[str, set[str]] = {}

    for contribution in contributions:
        flat_permissions = contribution.get("permissions") or {}
        if not isinstance(flat_permissions, dict):
            # Malformed payload: ignore it for the registry AND for the default
            # role grants below. Iterating a str/list here would grant roles
            # permissions that were never registered.
            flat_permissions = {}
        registry = _merge_flat_permissions(registry, flat_permissions)

        contrib_endpoints = contribution.get("endpoints") or {}
        if isinstance(contrib_endpoints, dict):
            for path, methods in contrib_endpoints.items():
                if not isinstance(methods, dict):
                    continue
                endpoints.setdefault(path, {})
                for method, spec in methods.items():
                    if isinstance(spec, dict):
                        # Method names are stored lower-cased.
                        endpoints[path][method.lower()] = copy.deepcopy(spec)

        explicit_roles = contribution.get("role_permissions") or {}
        if isinstance(explicit_roles, dict):
            for role_name, perms in explicit_roles.items():
                if not isinstance(perms, list):
                    continue
                role_grants.setdefault(role_name, set()).update(str(p) for p in perms)

        for perm_id in flat_permissions:
            for role_name in _default_roles_for_permission(perm_id):
                role_grants.setdefault(role_name, set()).add(perm_id)

    authz["permissions"] = registry

    for role_name, perm_ids in role_grants.items():
        role_cfg = roles.setdefault(role_name, {"permissions": []})
        existing = role_cfg.setdefault("permissions", [])
        if not isinstance(existing, list):
            continue
        # Sorted so the appended order is deterministic; existing entries keep
        # their position.
        for perm_id in sorted(perm_ids):
            if perm_id not in existing:
                existing.append(perm_id)

    return merged
a/packages/nmp_platform/README.md b/packages/nmp_platform/README.md index aa65b3c0..c819acd7 100644 --- a/packages/nmp_platform/README.md +++ b/packages/nmp_platform/README.md @@ -62,3 +62,11 @@ tests/test_main.py The `config/` files (`local.yaml`, `local.env`) are not Python — they are the default config consumed by `nemo services run` during local development and referenced from several Makefiles and run scripts in the repo. + +`local.env` sets SQLite for the entity store (`~/.local/share/nemo/nmp-platform.db`) +so no PostgreSQL is required. Source it before starting services: + +```bash +set -a && source packages/nmp_platform/config/local.env && set +a +uv run nemo services run --host 127.0.0.1 --port 8080 +``` diff --git a/packages/nmp_platform/config/local.env b/packages/nmp_platform/config/local.env index 2cb27c2b..dd00798e 100644 --- a/packages/nmp_platform/config/local.env +++ b/packages/nmp_platform/config/local.env @@ -1,20 +1,27 @@ -# Environment variables for local development - NO external dependencies -# -# This env file runs the platform using SQLite and noop secrets. -# No PostgreSQL, OpenBao, or other external services required. +# Environment variables for local development — no external dependencies. # -# Usage: set -a && source packages/nmp_platform/config/local.env && set +a +# Entity store uses SQLite below. No PostgreSQL, OpenBao, or other services required. 
+# +# Usage (from repo root): +# set -a && source packages/nmp_platform/config/local.env && set +a +# uv run nemo services run --host 127.0.0.1 --port 8080 +# +# CLI against a remote platform (no local nemo services run): +# export NMP_BASE_URL=http://10.0.0.51:8080 +# nemo auth login --unsigned-token --email you@example.com +# +# Reset local DB + files: stop the platform, then rm -rf ~/.local/share/nemo + +# Platform API for `nemo` CLI (overrides ~/.config/nmp/config.yaml when this file is sourced) +NMP_BASE_URL=http://127.0.0.1:8080 +NEMO_BASE_URL=http://127.0.0.1:8080 # Config file NMP_CONFIG_FILE_PATH=packages/nmp_platform/config/local.yaml -# Database connections (postgres exposed at localhost:5432) -DATABASE_HOST=localhost -DATABASE_PORT=5432 -DATABASE_USER=nmp -DATABASE_PASSWORD=nmp -DATABASE_NAME=nmp -DATABASE_DIALECT=postgresql +# Entity store (SQLite; parent dir is created on first platform start) +DATABASE_DIALECT=sqlite +DATABASE_PATH="${HOME}/.local/share/nemo/nmp-platform.db" # Docker configuration for jobs (connect to host Docker socket directly) DOCKER_HOST=unix:///var/run/docker.sock diff --git a/packages/nmp_platform/config/local.yaml b/packages/nmp_platform/config/local.yaml index 1c51e331..87940bde 100644 --- a/packages/nmp_platform/config/local.yaml +++ b/packages/nmp_platform/config/local.yaml @@ -1,5 +1,12 @@ -# Local development configuration for running platform with quickstart infrastructure -# Usage: NMP_CONFIG_FILE_PATH=packages/nmp_platform/config/local.yaml uv run nemo services run +# Local development for nemo services run (SQLite entity store, embedded auth). +# +# set -a && source packages/nmp_platform/config/local.env && set +a +# export NMP_BASE_URL=http://127.0.0.1:8080 +# uv run nemo services run --host 127.0.0.1 --port 8080 +# +# Use default service set (omit --services) or --service-group all.
Then: +# nemo auth login --unsigned-token +# uv run nemo-platform run task --task nmp.platform_seed platform: runtime: "docker" @@ -13,7 +20,8 @@ platform: service: {} auth: - enabled: false + enabled: true + allow_unsigned_jwt: true # local CLI: nemo auth login --unsigned-token policy_decision_point_provider: embedded policy_decision_point_base_url: "http://localhost:8080" # Low timeouts for fast test feedback (same as integration tests) @@ -34,11 +42,12 @@ auth: # default_scopes: "platform:read platform:write openid profile email offline_access" # scope_prefix: "api://nmp/" -# Entities service configuration (uses DATABASE_* env vars from local.env) +# Entities service configuration (SQLite via DATABASE_* in local.env) entities: {} # Jobs service configuration jobs: + # Explicitly register the subprocess executor at profile "default". This opts # the documented `cpu/default` plugin steps (Data Designer create, Evaluator # metrics, Anonymizer, hello-world, etc.) into the cpu→subprocess translation @@ -62,6 +71,17 @@ jobs: ttl_seconds_before_active: 60 ttl_seconds_active: 3600 ttl_seconds_after_finished: 300 + # Uncomment for using customizer + # - provider: cpu + # profile: gpu + # backend: docker + # config: + # launcher_tool_path: ./services/core/jobs/jobs-launcher/jobs-launcher + # - provider: gpu + # profile: gpu + # backend: docker + # config: + # launcher_tool_path: ./services/core/jobs/jobs-launcher/jobs-launcher # Local path to the jobs-launcher binary used by the Docker job backend executor_defaults: diff --git a/packages/nmp_platform_runner/src/nmp/platform_runner/config/local.yaml b/packages/nmp_platform_runner/src/nmp/platform_runner/config/local.yaml index da199f07..b85998d4 100644 --- a/packages/nmp_platform_runner/src/nmp/platform_runner/config/local.yaml +++ b/packages/nmp_platform_runner/src/nmp/platform_runner/config/local.yaml @@ -1,13 +1,16 @@ +# Bundled fallback when NMP_CONFIG_FILE_PATH is unset. 
For local dev, prefer: +# source packages/nmp_platform/config/local.env (SQLite + paths) platform: runtime: "docker" - base_url: "http://0.0.0.0:8080" + base_url: "http://127.0.0.1:8080" service: {} auth: - enabled: false + enabled: true + allow_unsigned_jwt: true policy_decision_point_provider: embedded - policy_decision_point_base_url: "http://localhost:8080" + policy_decision_point_base_url: "http://127.0.0.1:8080" policy_data_refresh_interval: 2 bundle_cache_seconds: 0 admin_email: "admin@example.com" diff --git a/packages/nmp_platform_runner/src/nmp/platform_runner/registry.py b/packages/nmp_platform_runner/src/nmp/platform_runner/registry.py index 39eed664..8c4310c8 100644 --- a/packages/nmp_platform_runner/src/nmp/platform_runner/registry.py +++ b/packages/nmp_platform_runner/src/nmp/platform_runner/registry.py @@ -64,6 +64,7 @@ OPENAPI_SERVICES = [ "auth", + "customization", "entities", "evaluation", "files", diff --git a/packages/nmp_platform_runner/tests/test_registry.py b/packages/nmp_platform_runner/tests/test_registry.py index 504bcf3d..74798380 100644 --- a/packages/nmp_platform_runner/tests/test_registry.py +++ b/packages/nmp_platform_runner/tests/test_registry.py @@ -90,6 +90,26 @@ def test_openapi_services_are_explicit_and_do_not_auto_include_plugins(monkeypat assert registry.get_openapi_service_names(available) == ["auth", "evaluation"] +def test_customization_in_openapi_when_plugin_service_available(monkeypatch): + clear_registry_caches() + + class CustomizationService(NemoService): + name = "customization" + + def get_routers(self) -> list[RouterSpec]: + return [RouterSpec(router=APIRouter())] + + monkeypatch.setattr( + registry, + "AVAILABLE_SERVICES", + {"auth": "nmp.core.auth.main:service"}, + ) + monkeypatch.setattr(registry, "discover_services", lambda: {"customization": CustomizationService}) + + available = registry.get_available_services() + assert "customization" in registry.get_openapi_service_names(available) + + def 
test_intake_is_registered_as_api_and_openapi_service(): clear_registry_caches() available = registry.get_available_services() diff --git a/patch.diff b/patch.diff new file mode 100644 index 00000000..7be122f6 --- /dev/null +++ b/patch.diff @@ -0,0 +1,17985 @@ +diff --git a/docker-bake.automodel.hcl b/docker-bake.automodel.hcl +new file mode 100644 +index 000000000..89fc97453 +--- /dev/null ++++ b/docker-bake.automodel.hcl +@@ -0,0 +1,202 @@ ++# nmp-automodel image bake - run from Platform repo root (context = "."). ++# ++# Inspect targets (no build; finishes in ~0s): ++# docker buildx bake --print -f docker-bake.automodel.hcl nmp-automodel-gpu-wheels ++# ++# Build wheels (override registry/tag via env, not --set): ++# export WHEELS_REGISTRY=nvcr.io/0921617854601259/nemo-platform-dev ++# export WHEELS_TAG=$(git rev-parse --short HEAD) ++# docker buildx bake -f docker-bake.automodel.hcl nmp-automodel-gpu-wheels --push ++# ++# Build automodel images: ++# docker buildx bake -f docker-bake.automodel.hcl nmp-automodel-base-builder ++# ++# Published tags: nvcr.io/0921617854601259/nemo-platform-dev/nmp/automodel-{base,tasks,training}: ++ ++variable "IMAGE_REGISTRY" { ++ default = "nvcr.io/0921617854601259/nemo-platform-dev" ++} ++ ++variable "BASE_REGISTRY" { ++ default = "nvcr.io/0921617854601259/nemo-platform-dev" ++} ++ ++variable "WHEELS_REGISTRY" { ++ default = "nvcr.io/0921617854601259/nemo-platform-dev" ++} ++ ++variable "BAKE_TAG" { ++ default = "local" ++} ++ ++variable "BASE_TAG_AUTOMODEL" { ++ default = "local" ++} ++ ++variable "WHEELS_TAG" { ++ default = "3fd6986ff173b598446ffac06d9be3f84b482495" ++} ++ ++variable "CUDA_VERSION" { ++ default = "12.8.1" ++} ++ ++variable "MAMBA_22_COMMIT" { ++ default = "6b32be06d026e170b3fdaf3ae6282c5a6ff57b06" ++} ++ ++variable "MAMBA_23_COMMIT" { ++ default = "v2.3.0" ++} ++ ++variable "CAUSAL_CONV1D_VERSION" { ++ default = "v1.5.3" ++} ++ ++# For local builds: --set "*.platform=linux/amd64" ++variable "BUILD_PLATFORMS" { ++ 
default = ["linux/amd64", "linux/arm64"] ++} ++ ++function "wheel_tags" { ++ params = [name] ++ result = ["${WHEELS_REGISTRY}/${name}:${WHEELS_TAG}"] ++} ++ ++function "get_causal_conv1d_wheel_image" { ++ params = [] ++ result = "${WHEELS_REGISTRY}/causal-conv1d-wheel:${WHEELS_TAG}" ++} ++ ++function "get_mamba_ssm_wheel_image" { ++ params = [] ++ result = "${WHEELS_REGISTRY}/mamba-ssm-wheel:${WHEELS_TAG}" ++} ++ ++group "nmp-automodel-gpu-wheels" { ++ targets = [ ++ "causal-conv1d-wheel", ++ "mamba-ssm-wheel", ++ ] ++} ++ ++group "nmp-automodel" { ++ targets = [ ++ "nmp-automodel-base-builder", ++ "nmp-automodel-tasks-docker", ++ "nmp-automodel-training-docker", ++ "nmp-automodel-tasks-smoke-test", ++ "nmp-automodel-training-smoke-test", ++ ] ++} ++ ++# Pre-built mamba-ssm / causal-conv1d wheels (cp311, cp312, cu13.1.1). Pushed to WHEELS_REGISTRY. ++target "causal-conv1d-wheel" { ++ target = "causal-conv1d-wheel" ++ context = "." ++ dockerfile = "services/automodel/docker/Dockerfile.mamba-wheel" ++ tags = wheel_tags("causal-conv1d-wheel") ++ args = { ++ CUDA_VERSION = CUDA_VERSION ++ CAUSAL_CONV1D_VERSION = CAUSAL_CONV1D_VERSION ++ } ++ platforms = BUILD_PLATFORMS ++} ++ ++target "mamba-ssm-wheel" { ++ target = "mamba-ssm-wheel" ++ context = "." ++ dockerfile = "services/automodel/docker/Dockerfile.mamba-wheel" ++ tags = wheel_tags("mamba-ssm-wheel") ++ args = { ++ CUDA_VERSION = CUDA_VERSION ++ MAMBA_22_COMMIT = MAMBA_22_COMMIT ++ MAMBA_23_COMMIT = MAMBA_23_COMMIT ++ } ++ platforms = BUILD_PLATFORMS ++} ++ ++target "platform-workspace" { ++ target = "platform-workspace" ++ context = "." ++ dockerfile = "services/automodel/docker/Dockerfile.platform-workspace" ++} ++ ++target "nmp-automodel-base-builder" { ++ target = "nmp-automodel-base-builder" ++ context = "." 
++ dockerfile = "services/automodel/docker/Dockerfile.nmp-automodel-base" ++ no-cache-filter = ["automodel-clone"] ++ tags = ["${IMAGE_REGISTRY}/nmp/automodel-base:${BAKE_TAG}"] ++ args = { ++ CAUSAL_CONV1D_WHEEL_IMAGE = get_causal_conv1d_wheel_image() ++ MAMBA_SSM_WHEEL_IMAGE = get_mamba_ssm_wheel_image() ++ } ++ platforms = BUILD_PLATFORMS ++} ++ ++target "nmp-automodel-tasks-docker" { ++ target = "runtime" ++ context = "." ++ dockerfile = "services/automodel/docker/Dockerfile.nmp-automodel-tasks" ++ contexts = { ++ platform-workspace = "target:platform-workspace" ++ nmp-automodel-base = "target:nmp-automodel-base-builder" ++ } ++ tags = ["${IMAGE_REGISTRY}/nmp/automodel-tasks:${BAKE_TAG}"] ++ args = { ++ BASE_REGISTRY = BASE_REGISTRY ++ BASE_TAG_AUTOMODEL = BASE_TAG_AUTOMODEL ++ } ++ platforms = BUILD_PLATFORMS ++} ++ ++target "nmp-automodel-training-docker" { ++ target = "runtime" ++ context = "." ++ dockerfile = "services/automodel/docker/Dockerfile.nmp-automodel-training" ++ contexts = { ++ platform-workspace = "target:platform-workspace" ++ nmp-automodel-base = "target:nmp-automodel-base-builder" ++ } ++ tags = ["${IMAGE_REGISTRY}/nmp/automodel-training:${BAKE_TAG}"] ++ args = { ++ BASE_REGISTRY = BASE_REGISTRY ++ BASE_TAG_AUTOMODEL = BASE_TAG_AUTOMODEL ++ } ++ platforms = BUILD_PLATFORMS ++} ++ ++target "nmp-automodel-tasks-smoke-test" { ++ target = "smoke-test" ++ context = "." ++ dockerfile = "services/automodel/docker/Dockerfile.nmp-automodel-tasks" ++ contexts = { ++ platform-workspace = "target:platform-workspace" ++ nmp-automodel-base = "target:nmp-automodel-base-builder" ++ } ++ args = { ++ BASE_REGISTRY = BASE_REGISTRY ++ BASE_TAG_AUTOMODEL = BASE_TAG_AUTOMODEL ++ SMOKE_MARKER = "smoke_nmp_automodel_tasks" ++ } ++ output = ["type=cacheonly"] ++ platforms = BUILD_PLATFORMS ++} ++ ++target "nmp-automodel-training-smoke-test" { ++ target = "smoke-test" ++ context = "." 
++ dockerfile = "services/automodel/docker/Dockerfile.nmp-automodel-training" ++ contexts = { ++ platform-workspace = "target:platform-workspace" ++ nmp-automodel-base = "target:nmp-automodel-base-builder" ++ } ++ args = { ++ BASE_REGISTRY = BASE_REGISTRY ++ BASE_TAG_AUTOMODEL = BASE_TAG_AUTOMODEL ++ SMOKE_MARKER = "smoke_nmp_automodel_training" ++ } ++ output = ["type=cacheonly"] ++ platforms = BUILD_PLATFORMS ++} +diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/customization_contributor.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/customization_contributor.py +new file mode 100644 +index 000000000..ec2595f01 +--- /dev/null ++++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/customization_contributor.py +@@ -0,0 +1,27 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Contributor protocol for customization training backends.""" ++ ++from __future__ import annotations ++ ++from typing import TYPE_CHECKING, ClassVar, Protocol, runtime_checkable ++ ++import typer ++ ++if TYPE_CHECKING: ++ from nemo_platform_plugin.service import RouterSpec ++ ++ ++@runtime_checkable ++class CustomizationContributor(Protocol): ++ """One training backend mounted under ``/apis/customization``.""" ++ ++ name: ClassVar[str] ++ dependencies: ClassVar[list[str]] ++ ++ def get_routers(self) -> list[RouterSpec]: ++ """HTTP routes for this backend (workspace-scoped prefix per backend).""" ++ ++ def get_cli(self) -> typer.Typer | None: ++ """CLI subgroup mounted at ``nemo customization ``.""" +diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/discovery.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/discovery.py +index 60eb440ba..68bd67b47 100644 +--- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/discovery.py ++++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/discovery.py +@@ -22,6 +22,7 @@ 
Entry-point groups and their wrappers + ``nemo.docs`` → :func:`discover_docs` — ``() -> Path | dict`` callable + ``nemo.executors`` → :func:`discover_executors` — ``Executor`` class + ``nemo.inference_middleware`` → :func:`discover_inference_middleware` — :class:`~nemo_platform_plugin.inference_middleware.NemoInferenceMiddleware` subclass (typed, IGW instantiates) ++``nemo.customization.contributors`` → :func:`discover_customization_contributors` — :class:`~nemo_platform_plugin.customization_contributor.CustomizationContributor` instance (typed, customization router instantiates) + ``nemo.seed`` → :func:`discover_seed_jobs` — :class:`~nemo_platform_plugin.seed.NemoSeedJob` subclass (typed, platform instantiates) + + Wrappers for surfaces whose types are not yet defined in this package return +@@ -51,6 +52,7 @@ if TYPE_CHECKING: + from nemo_platform_plugin.cli import NemoCLI + from nemo_platform_plugin.controller import NemoController + from nemo_platform_plugin.function import NemoFunction ++ from nemo_platform_plugin.customization_contributor import CustomizationContributor + from nemo_platform_plugin.inference_middleware import NemoInferenceMiddleware + from nemo_platform_plugin.job import NemoJob + from nemo_platform_plugin.seed import NemoSeedJob +@@ -73,6 +75,7 @@ _ALL_SURFACE_GROUPS = ( + "nemo.docs", + "nemo.executors", + "nemo.inference_middleware", ++ "nemo.customization.contributors", + "nemo.seed", + ) + +@@ -95,9 +98,12 @@ _SURFACE_ALLOWLIST_ENV_VARS: dict[str, str] = { + "nemo.docs": "NEMO_PLUGIN_DOCS_ALLOWLIST", + "nemo.executors": "NEMO_PLUGIN_EXECUTORS_ALLOWLIST", + "nemo.inference_middleware": "NEMO_PLUGIN_INFERENCE_MIDDLEWARE_ALLOWLIST", ++ "nemo.customization.contributors": "NEMO_PLUGIN_CUSTOMIZATION_CONTRIBUTORS_ALLOWLIST", + "nemo.seed": "NEMO_PLUGIN_SEED_ALLOWLIST", + } + ++CUSTOMIZATION_CONTRIBUTORS_GROUP = "nemo.customization.contributors" ++ + + def _manifest_plugin_name(group: str, entry_point_name: str) -> str: + if group in 
_DOT_SCOPED_GROUPS: +@@ -468,6 +474,71 @@ def discover_executors() -> dict[str, Any]: + return discover("nemo.executors") + + ++def _instantiate_customization_contributor(loaded: object) -> CustomizationContributor: ++ from nemo_platform_plugin.customization_contributor import CustomizationContributor ++ ++ if isinstance(loaded, type): ++ instance = loaded() ++ else: ++ instance = loaded ++ if not isinstance(instance, CustomizationContributor): ++ raise TypeError( ++ f"Expected CustomizationContributor instance, got {type(instance)!r}", ++ ) ++ return instance ++ ++ ++@cache ++def discover_customization_contributors() -> dict[str, CustomizationContributor]: ++ """Typed wrapper: discover ``nemo.customization.contributors`` entry-points. ++ ++ Returns a dict keyed by entry-point key (e.g. ``"automodel"``) mapping to a ++ :class:`~nemo_platform_plugin.customization_contributor.CustomizationContributor` ++ instance. Entry points may register a class (instantiated here) or a pre-built ++ instance. Broken contributors are skipped with a warning (same fault isolation as ++ :func:`discover`). 
++ """ ++ from nemo_platform_plugin.customization_contributor import CustomizationContributor ++ ++ result: dict[str, CustomizationContributor] = {} ++ ++ for ep in discover_entry_points(CUSTOMIZATION_CONTRIBUTORS_GROUP).values(): ++ try: ++ loaded = ep.load() ++ contributor = _instantiate_customization_contributor(loaded) ++ key = getattr(type(contributor), "name", None) or ep.name ++ if key != ep.name: ++ logger.warning( ++ "Contributor entry-point key %r differs from class name %r; using entry-point key", ++ ep.name, ++ key, ++ ) ++ result[ep.name] = contributor ++ logger.debug( ++ "Loaded customization contributor %r from %s", ++ ep.name, ++ ep.value, ++ ) ++ except Exception: ++ logger.warning( ++ "Failed to load customization contributor %r (%s) — skipping", ++ ep.name, ++ ep.value, ++ exc_info=True, ++ ) ++ ++ return result ++ ++ ++def discover_customization_contributor_classes() -> dict[str, type]: ++ """Return contributor entry-point name → loaded class (for tests).""" ++ result: dict[str, type] = {} ++ for key, loaded in discover(CUSTOMIZATION_CONTRIBUTORS_GROUP).items(): ++ if isinstance(loaded, type): ++ result[key] = loaded ++ return result ++ ++ + def discover_inference_middleware() -> dict[str, type[NemoInferenceMiddleware]]: + """Typed wrapper: discover ``nemo.inference_middleware`` entry-points. 
+ +diff --git a/packages/nemo_platform_plugin/tests/test_discovery.py b/packages/nemo_platform_plugin/tests/test_discovery.py +index 3eda7da6d..ab472b499 100644 +--- a/packages/nemo_platform_plugin/tests/test_discovery.py ++++ b/packages/nemo_platform_plugin/tests/test_discovery.py +@@ -13,8 +13,10 @@ from fastapi import APIRouter + from nemo_platform_plugin.cli import NemoCLI + from nemo_platform_plugin.discovery import ( + _ALL_SURFACE_GROUPS, ++ CUSTOMIZATION_CONTRIBUTORS_GROUP, + discover, + discover_cli, ++ discover_customization_contributors, + discover_entry_points, + discover_functions, + discover_jobs, +@@ -40,10 +42,12 @@ def clear_discovery_cache(): + discover_entry_points.cache_clear() + discover.cache_clear() + discover_manifests.cache_clear() ++ discover_customization_contributors.cache_clear() + yield + discover_entry_points.cache_clear() + discover.cache_clear() + discover_manifests.cache_clear() ++ discover_customization_contributors.cache_clear() + + + # --------------------------------------------------------------------------- +@@ -563,3 +567,49 @@ class TestDiscoverManifests: + result = discover_manifests() + assert list(result.keys()) == ["example"] + assert result["example"].version == "1.2.3" ++ ++ ++class TestDiscoverCustomizationContributors: ++ def test_group_in_all_surface_groups(self) -> None: ++ assert CUSTOMIZATION_CONTRIBUTORS_GROUP in _ALL_SURFACE_GROUPS ++ ++ def test_uses_customization_contributors_group(self) -> None: ++ with patch("nemo_platform_plugin.discovery.entry_points", return_value=[]) as mock_eps: ++ discover_customization_contributors() ++ mock_eps.assert_called_once_with(group=CUSTOMIZATION_CONTRIBUTORS_GROUP) ++ ++ def test_instantiates_contributor_class(self) -> None: ++ class _Contributor: ++ name = "fake" ++ dependencies = ["jobs"] ++ ++ def get_routers(self) -> list[RouterSpec]: ++ return [] ++ ++ def get_cli(self) -> None: ++ return None ++ ++ ep = _make_ep("fake", _Contributor) ++ with 
patch("nemo_platform_plugin.discovery.entry_points", return_value=[ep]): ++ result = discover_customization_contributors() ++ assert isinstance(result["fake"], _Contributor) ++ ++ def test_failing_contributor_is_skipped(self) -> None: ++ bad = _make_ep("bad", None) ++ bad.load.side_effect = RuntimeError("broken") ++ ++ class _Contributor: ++ name = "good" ++ dependencies = ["jobs"] ++ ++ def get_routers(self) -> list[RouterSpec]: ++ return [] ++ ++ def get_cli(self) -> None: ++ return None ++ ++ good = _make_ep("good", _Contributor) ++ with patch("nemo_platform_plugin.discovery.entry_points", return_value=[bad, good]): ++ result = discover_customization_contributors() ++ assert "bad" not in result ++ assert "good" in result +diff --git a/packages/nmp_platform_runner/src/nmp/platform_runner/registry.py b/packages/nmp_platform_runner/src/nmp/platform_runner/registry.py +index b3f9813bc..dafd73aa6 100644 +--- a/packages/nmp_platform_runner/src/nmp/platform_runner/registry.py ++++ b/packages/nmp_platform_runner/src/nmp/platform_runner/registry.py +@@ -65,6 +65,7 @@ API_SERVICES = [ + OPENAPI_SERVICES = [ + "audit", + "auth", ++ "customization", + "entities", + "evaluation", + "files", +diff --git a/packages/nmp_platform_runner/tests/test_registry.py b/packages/nmp_platform_runner/tests/test_registry.py +index 4eba56620..c530368a9 100644 +--- a/packages/nmp_platform_runner/tests/test_registry.py ++++ b/packages/nmp_platform_runner/tests/test_registry.py +@@ -90,6 +90,26 @@ def test_openapi_services_are_explicit_and_do_not_auto_include_plugins(monkeypat + assert registry.get_openapi_service_names(available) == ["auth", "evaluation"] + + ++def test_customization_in_openapi_when_plugin_service_available(monkeypatch): ++ clear_registry_caches() ++ ++ class CustomizationService(NemoService): ++ name = "customization" ++ ++ def get_routers(self) -> list[RouterSpec]: ++ return [RouterSpec(router=APIRouter())] ++ ++ monkeypatch.setattr( ++ registry, ++ "AVAILABLE_SERVICES", ++ 
{"auth": "nmp.core.auth.main:service"}, ++ ) ++ monkeypatch.setattr(registry, "discover_services", lambda: {"customization": CustomizationService}) ++ ++ available = registry.get_available_services() ++ assert "customization" in registry.get_openapi_service_names(available) ++ ++ + def test_intake_is_registered_as_api_and_openapi_service(): + clear_registry_caches() + available = registry.get_available_services() +diff --git a/plugins/nemo-automodel/README.md b/plugins/nemo-automodel/README.md +new file mode 100644 +index 000000000..3c6df3a55 +--- /dev/null ++++ b/plugins/nemo-automodel/README.md +@@ -0,0 +1,30 @@ ++# nemo-automodel-plugin ++ ++Automodel training contributor under `/apis/customization/v2/workspaces/{workspace}/automodel/`. ++ ++Requires **`nemo-customizer-plugin`** at runtime (router + `client.customization` SDK) and **`nmp-automodel`** (compiler/tasks). The Automodel plugin does not declare a pyproject dependency on the customizer plugin — install both via root `enabled-plugins`: ++ ++```bash ++uv sync --group enabled-plugins ++``` ++ ++## CLI ++ ++Verbs are mounted directly on the contributor (no `jobs` subgroup): ++ ++```bash ++nemo customization automodel explain ++nemo customization automodel submit path/to/job.json ++nemo customization automodel submit path/to/job.json -w acme-corp ++nemo customization automodel submit path/to/job.json --cluster my-cluster ++``` ++ ++`run` is registered but **always fails** — Automodel training is submit-only (platform API / Docker GPU jobs), not local subprocess execution: ++ ++```bash ++nemo customization automodel run path/to/job.json # exits with error ++``` ++ ++Other customization backends may still use `nemo customization jobs submit ...`. ++ ++Job JSON uses the simplified `AutomodelJobInput` schema (see `nemo_automodel_plugin/schema.py`). Submit posts to `/apis/customization/v2/workspaces/{workspace}/automodel/jobs`. 
+diff --git a/plugins/nemo-automodel/SCOPE.md b/plugins/nemo-automodel/SCOPE.md +new file mode 100644 +index 000000000..1d5f8c85a +--- /dev/null ++++ b/plugins/nemo-automodel/SCOPE.md +@@ -0,0 +1,967 @@ ++# NeMo Automodel Plugin — Work Scope ++ ++**Start here:** [Implementation order](#implementation-order) (sequence, checklists, success criteria). ++ ++This document scopes the work to replace the legacy Customizer Automodel path with a first-party **NeMo Automodel plugin** (customization **contributor**), the **`nemo-customizer-plugin`** router at `/apis/customization`, and the **`nmp-automodel`** task/compiler package (no standalone HTTP server). Legacy `Platform/services/customizer/` is reference only. New work: `plugins/nemo-customizer/`, `plugins/nemo-automodel/`, `services/automodel/`. ++ ++Training is powered by the upstream **`nemo_automodel`** library (repo: `Automodel/` at workspace root, NGC image `nvcr.io/nvidia/nemo-automodel:25.11.00`). ++ ++--- ++ ++## Implementation order ++ ++Canonical sequencing for this scope. **Work breakdown** (below) and design sections add detail; checklists live here only. ++ ++### Sequence overview ++ ++| Step | Focus | Package / area | Blocks | ++|------|--------|----------------|--------| ++| **0** | Design lock + platform Jobs flag | cross-cutting | — | ++| **1** | Customization router | `plugins/nemo-customizer` | Automodel HTTP (step 4) | ++| **2** | Task/compiler library | `services/automodel` (`nmp-automodel`) | Images (step 3), contributor compile (step 4) | ++| **3** | Container images | `nmp-automodel` Dockerfiles | E2E GPU runs | ++| **4** | Automodel plugin + Docker gate | `plugins/nemo-automodel` | CLI submit (step 5), integration (step 6) | ++| **5** | CLI submit path | `nemo-automodel` + router CLI | — | ++| **6** | Tests & contracts | `Platform/tests/...` | — | ++| **7** | SDK, OpenAPI, docs, deploy | platform + plugins | — | ++ ++**Parallel OK:** Step 0 Jobs flag with step 1–2. 
Step 2 compiler port with step 1 router (after contributor protocol is sketched). ++ ++```mermaid ++flowchart LR ++ S0[Step 0 Design lock] ++ S1[Step 1 nemo-customizer] ++ S2[Step 2 nmp-automodel] ++ S3[Step 3 Images] ++ S4[Step 4 nemo-automodel plugin] ++ S5[Step 5 CLI] ++ S6[Step 6 Tests] ++ S7[Step 7 Docs deploy] ++ S0 --> S1 ++ S0 --> S2 ++ S1 --> S4 ++ S2 --> S3 ++ S2 --> S4 ++ S3 --> S6 ++ S4 --> S5 ++ S4 --> S6 ++ S5 --> S7 ++ S6 --> S7 ++``` ++ ++--- ++ ++### Step 0 — Design lock & platform prerequisites ++ ++Lock names, routes, schemas, and cross-cutting Jobs config before feature PRs. First implementation PR can be plugin + `nmp-automodel` without Studio migration. ++ ++**Design lock checklist** ++ ++- [x] **Name & routes:** Router `NemoService.name = customization`; Automodel contributor prefix `v2/workspaces/{workspace}/automodel` → `/apis/customization/v2/workspaces/{workspace}/automodel/...`; CLI `nemo customization automodel` — [URL routing](#url-routing-decided), [Customization router](#customization-router-in-scope--v1). ++- [x] **Workspace contract:** Path `{workspace}` authoritative; spec uses workspace-relative names + optional `ws/name` qualifiers; dataset URI rules documented; **no** `workspace` key in JSON body — [Workspace scoping](#workspace-scoping-required). ++- [x] **Simplified JSON schema:** Publish `AutomodelJobInput` (v1) for POST/CLI; `AutomodelJobOutput` for stored/GET; `extra="forbid"` — [Simplified JSON spec](#simplified-json-spec-draft--automodeljobinput-only). ++- [x] **Schema validators (legacy parity):** Reject `output_model` with message to use `output` (legacy `CustomizationJobInput`); `model_config` / field validators for distillation-only fields when `training_type: sft`. ++- [x] **Dataset shape:** `{ training, validation? }` fileset URIs; `to_spec()` runs `check_dataset_access` per ref (port `platform_client`) — legacy API used a single `dataset` string; mapping documented in migration guide. 
++- [x] **Integrations:** `wandb` / `mlflow` accept `api_key_secret` (`SecretRef`) plus enabled/project fields — not only `null` placeholders. ++- [x] **v1 exclusions (locked):** `deployment_config` (post-train NIM deploy), embedding-model SFT, DPO/GRPO — see [Decisions](#decisions-resolved). ++- [x] **Input vs canonical spec (Option A):** Two schemas + `to_spec()` — port `transform_input_to_output` — [Input vs canonical spec](#input-vs-canonical-spec--decided-option-a). ++- [x] **Deprecation / Studio:** Legacy customizer not in default `AVAILABLE_SERVICES`; UI feature-flagged off — [Deprecation](#deprecation--platform-spin-up-and-studio-verified). ++ ++**Workspace registration (do before first integration test):** ++ ++- [x] Add `plugins/nemo-customizer`, `plugins/nemo-automodel`, `services/automodel` to root `Platform/pyproject.toml` workspace members. ++- [x] Add `nemo-customizer-plugin` and `nemo-automodel-plugin` to `[dependency-groups] enabled-plugins` (pattern: `nemo-evaluator-plugin`). ++ ++**Platform Jobs — `jobs.enable_subprocess_executor`** (cross-cutting, not Automodel-only; rationale in [Platform jobs: `runtime` vs step executors](#platform-jobs-runtime-vs-step-executors)): ++ ++- [x] Add field to `JobsServiceConfig` (`Platform/services/core/jobs/src/nmp/core/jobs/config.py`). ++- [x] Gate `SubprocessJobExecutionProfile` in `get_default_executor_profiles_for_runtime()` (K8s default `false`, docker local default `true`). ++- [ ] Document in `Platform/services/core/jobs/README.md`. ++- [ ] Expose in `packages/nmp_platform/config/local.yaml` and `nmp_platform_runner` local config. ++- [ ] `GET /v2/execution-profiles` reflects the flag. ++ ++--- ++ ++### Step 1 — `nemo-customizer` (blocks Automodel HTTP) ++ ++**Problem:** `discover_services()` maps each `nemo.services` key to one `/apis/<name>/` mount — only one owner for `customization`.
Training backends (Automodel, RL, Megatron, Unsloth) must share one URL tree without a monolithic `nmp-customizer` or per-backend top-level services. ++ ++**Solution:** New package `plugins/nemo-customizer/` (`nemo_customizer`) ships the sole `nemo.services` → `customization` registration. Backends register as **contributors** via `nemo.customization.contributors`. Full design: [Customization router](#customization-router-in-scope--v1). ++ ++**Router behavior (implement in this step):** ++ ++1. `discover_customization_contributors()` — fault-isolated; allowlist `NEMO_PLUGIN_CUSTOMIZATION_CONTRIBUTORS_ALLOWLIST` (or `NEMO_PLUGIN_ALLOWLIST`). ++2. **Zero contributors** → fail startup with clear error. ++3. `CustomizationRouterService.get_routers()` — merge `RouterSpec` lists; **`dependencies`** = union of contributor + platform deps. ++4. `CustomizationCLI.get_cli()` — `typer.Typer(name="customization")` + mount contributor subgroups (`automodel`, …). ++5. OpenAPI / SDK — single service name `customization` when router + ≥1 contributor enabled. ++6. **Route collision guard** — distinct segment per contributor under `.../workspaces/{workspace}/`; legacy `.../jobs` unmounted in v1. ++ ++**`nemo-customizer-plugin` pyproject.toml:** ++ ++```toml ++[project.entry-points."nemo.services"] ++customization = "nemo_customizer.router:CustomizationRouterService" ++ ++[project.entry-points."nemo.cli"] ++customization = "nemo_customizer.cli:CustomizationCLI" ++``` ++ ++**Deliverables** ++ ++- [x] `CustomizationContributor` protocol in `nemo_platform_plugin/customization_contributor.py`; `discover_customization_contributors()` in `nemo_platform_plugin/discovery.py` (fault-isolated via `discover_entry_points`; allowlist `NEMO_PLUGIN_CUSTOMIZATION_CONTRIBUTORS_ALLOWLIST` or `NEMO_PLUGIN_ALLOWLIST`). `nemo_customizer/discovery.py` re-exports for backward compatibility. 
++- [x] **Zero contributors:** fail router startup with a clear error (do not mount an empty `/apis/customization` tree silently). ++- [x] `CustomizationRouterService` + `CustomizationCLI` (merge contributors); **`dependencies`** = union of all contributor `dependencies` plus platform deps (`entities`, `jobs`, `auth`, …). ++- [x] Entry points: `nemo.services` + `nemo.cli` → key `customization`. ++- [x] Unit tests: two fake contributors → merged routes; prefix collision detection; zero contributors → startup error. ++- [x] `OPENAPI_SERVICES` / registry: include `customization` when router plugin enabled **and** ≥1 contributor discovered. ++- [x] `docs/CUSTOMIZATION.md` — contributor author guide (RL / Megatron / Unsloth). ++- [x] Workspace members + `enabled-plugins` — [Step 0 workspace registration](#step-0--design-lock--platform-prerequisites). ++ ++**Out of scope:** Legacy `POST .../workspaces/{ws}/jobs` multi-backend path; Studio cutover. ++ ++--- ++ ++### Step 2 — `nmp-automodel` package core ++ ++`Platform/services/automodel/` — Python package **`nmp-automodel`**: compilers, task entrypoints, Dockerfiles. **No HTTP server** (unlike legacy `customizer-server`). Reference port: `Platform/services/customizer/` (trim multi-backend paths only). ++ ++**4-step `PlatformJobSpec` pipeline** (Automodel-only): ++ ++1. `file_io` (CPU) — download model + datasets (`nmp/automodel-tasks` image). ++2. Training (GPU) — `finetune.py` + `nemo_automodel` recipes (SFT + KD); `nmp/automodel-training` image. ++3. `file_io` upload. ++4. `model_entity` — register model in Models service (behavior unchanged from legacy). 
++ ++| Area | Source | Action | ++|------|--------|--------| ++| Automodel config compiler | `tasks/training/backends/automodel/config.py` | Move; drop non-automodel imports; SFT + `_configure_kd()` | ++| Training runner/backend | `backend.py`, `finetune.py`, `callbacks.py`, `checkpoints.py` | Move; keep `JobsServiceProgressReporter` + `TrainingProgressCallback` (rank-0) | ++| Training step compiler | `app/jobs/training/compiler.py` | **Strip** to automodel-only; fixed `nmp/automodel-training` image ref | ++| Job compiler | `app/jobs/compiler.py` | **Strip** DPO/RL/`nemo_rl`/`megatron_bridge`; keep distillation (KD); 4-step only | ++| File I/O tasks | `tasks/file_io/` | Ported: `run.py`, `callbacks.py`, `utils.py`, `progress_reporter.py` | ++| Model entity task | `tasks/model_entity/` | Move unchanged behavior | ++| Schemas | `api/v2/jobs/schemas.py` | `AutomodelJobInput` + `AutomodelJobOutput` (+ sub-models) for plugin `to_spec` / compiler | ++ ++**Deliverables** ++ ++- [x] `nmp-automodel` installable; task entry points via console scripts + `nemo-platform run task --task nmp.automodel.tasks.*`. ++- [x] Unit tests: adapter + compiler (`services/automodel/tests/`); contract `generate_configs.py` imports `nmp.automodel`. ++- [x] Prove `PlatformJobSpec` generation for SFT (4-step pipeline, `nmp.automodel.tasks.*` commands, `nmp/automodel-training` / `nmp/automodel-tasks` images). ++- [x] `validate_for_training()` on legacy `CustomizationJobOutput` (compiler); plugin `AutomodelJobOutput` has parallel validator in `nemo_automodel_plugin.schema`. ++- [x] `platform_client.py` — `fetch_model_entity`, `check_dataset_access`. ++- [x] `_resolve_v4_compatible()` in training compiler. ++- [x] Task modules `nmp.automodel.tasks.{file_io,training,model_entity}`; compiled steps use `nmp.automodel.tasks.*`. ++- [x] `AutomodelConfig.default_training_execution_profile` (`NMP_AUTOMODEL_*`); adapter + compile wrapper apply request `profile`. 
++ ++**Internal Jobs callback path** (not a new public route — same contract as legacy customizer): ++ ++- [x] `NMPJobContext` env vars for job id, step, workspace, task name. ++- [x] `JobsServiceProgressReporter` / `TrainingProgressCallback` → `sdk.jobs.tasks.create_or_update` (rank-0 only). ++- [ ] Document as internal; exclude from public OpenAPI if auxiliary routes are added. ++ ++*Optional later:* webhook-style callbacks — out of initial scope. ++ ++--- ++ ++### Step 3 — Container images ++ ++Two runtime images (`nmp/automodel-tasks`, `nmp/automodel-training`) built from `nmp/automodel-base` (PyTorch + `nemo_automodel` deps), published under `nvcr.io/0921617854601259/nemo-platform-dev/nmp/...` — not the upstream NGC `nvcr.io/nvidia/nemo-automodel` training container name and not full `nmp-customizer` / RL / Megatron stack. Do **not** reuse or extend `customizer-automodel` during transition. ++ ++| Image key | Dockerfile | Used by | Contents | ++|-----------|------------|---------|----------| ++| `nmp/automodel-training` | `Dockerfile.nmp-automodel-training` | GPU training step | `nmp/automodel-base` + `nmp-automodel` finetune backend (SFT + KD recipes) | ++| `nmp/automodel-tasks` | `Dockerfile.nmp-automodel-tasks` | CPU steps (`file_io`, `model_entity`) | Slim glue; task entrypoints without customizer API server / RL / Megatron | ++ ++**Deliverables** ++ ++- [ ] Wire both keys in plugin `get_qualified_image()` / `NMP_AUTOMODEL_*` env overrides. ++- [ ] CI: smoke import on **training** image (pattern: `Platform/tests/smoke_gpu/test_customizer_automodel.py`); lighter smoke on **tasks** image. ++- [ ] Plugin README: size/dependency audit vs `customizer-automodel`. ++- [ ] Helm/assets: image refs (Studio cutover to new URLs still out of scope). ++ ++--- ++ ++### Step 4 — `nemo-automodel` plugin (contributor + job) ++ ++Plugin HTTP only — merged by router at `/apis/customization/.../automodel/...`. Requires **step 1** (`nemo-customizer-plugin`) in workspace. 
**`compile()`** depends on **step 2** (`nmp-automodel`). ++ ++**Automodel plugin `pyproject.toml` (contributor — not `nemo.services`):** ++ ++```toml ++[project.entry-points."nemo.customization.contributors"] ++automodel = "nemo_automodel_plugin.contributor:AutomodelContributor" ++ ++[project.entry-points."nemo.jobs"] ++"customization.automodel.jobs" = "nemo_automodel_plugin.jobs.jobs:AutomodelJob" ++``` ++ ++**Deliverables** ++ ++- [x] **pyproject.toml:** `nemo-platform-plugin`, `nmp-automodel` (no `nemo-customizer-plugin` wheel dep — router installed via `enabled-plugins` only). Entry points: ++ - `nemo.customization.contributors` → `AutomodelContributor` (`automodel`) ++ - `nemo.jobs` → `customization.automodel.jobs` → `AutomodelJob` ++ - optional `nemo.docs` (no `nemo.services` / no top-level `nemo.cli`) ++- [x] **`AutomodelContributor.get_routers()`** — optional `.../automodel/healthz`; mount jobs via `add_job_routes` (see wiring below); prefix `v2/workspaces/{workspace}/automodel`; `job_collection_path = "/automodel/jobs"`. ++- [x] **`add_job_routes` wiring (required):** ++ - `service_name="customization"` — Jobs `source`, list filters, and OpenAPI service segment (default `_derive_service_name()` → `nemo-automodel-plugin` is **wrong**). ++ - `generate_job_name=generate_automodel_id` — `automodel-{uuid.hex[:12]}` when body omits `name` (same pattern as legacy `generate_customization_id`). ++ - `route_options=[JobRouteOption.CORE]` — create/list/get/delete/status/cancel/results; **no** PAUSE_RESUME in v1 (legacy parity). ++ - `default_profile` from plugin config when spec omits `training.execution_profile`. ++ - Request-body `profile` on `BaseJobRequest` — **deferred** (platform `add_job_routes` still drops it); v1 uses **`training.execution_profile`** in JSON only. 
++- [x] **`AutomodelJob`:** `description` set; `input_spec_schema` / `spec_schema` / `to_spec()` (Option A); `compile()` on `AutomodelJobOutput` only; `dependencies`: `entities`, `auth`, `jobs`, `secrets`, `files`, `models`. ++- [ ] **Job envelope:** `description`, `project`, `ownership`, `custom_fields` — inherited from `job_route_factory` (no Automodel-specific fields); document in README. ++- [x] **`get_cli()`** — `automodel` Typer subgroup via `add_job_commands` (`jobs` → `run` / `submit` / `explain` to `.../automodel/jobs`). Data Designer–style `cli/inputs.py` simplified JSON is [Step 5](#step-5--cli-submit-path), not required for Step 4. ++- [x] SDK: `nemo-customizer-plugin` owns `nemo.sdk` → `customization`; composes `client.customization.automodel` from `nemo-automodel-plugin`. `nemo.docs` if user docs ship with plugin. ++- [x] Workspace members + `enabled-plugins` — [Step 0 workspace registration](#step-0--design-lock--platform-prerequisites). ++ ++**Docker enforcement & GPU validation** (`nemo_platform_plugin.jobs.docker` today: `validate_gpu_available_for_docker` only when `runtime == DOCKER` and reserved GPU list is empty — **extend for all Automodel jobs**): ++ ++- [x] At **compile** (plugin `compile()` or shared helper): require `NemoPlatformConfig.runtime == DOCKER`. ++- [x] Require `validate_docker_available()` (daemon reachable). ++- [x] Require GPU pool configured (reuse or extend `validate_gpu_available_for_docker`). ++- [x] `PlatformJobCompilationError` → 422, e.g. *“Automodel training requires `platform.runtime: docker` with GPU-backed container execution (Docker daemon reachable and GPUs configured).”* ++- [x] Do **not** silently downgrade `platform.runtime` to `NONE` for this plugin. ++- [ ] Do **not** conflate with `jobs.enable_subprocess_executor` — Automodel never schedules `subprocess` training steps. 
++ ++--- ++ ++### Step 5 — CLI submit path ++ ++First-class CLI for simplified JSON jobs (pattern: Data Designer `[CONFIG_SOURCE]` → canonical spec in `plugins/nemo-data-designer/.../cli/inputs.py`). Commands hang under `nemo customization automodel` (router CLI + contributor subgroup). ++ ++**Submit URL** (via `nemo_platform_plugin.commands` job submit helper): ++ ++`/apis/customization/v2/workspaces/{workspace}/automodel/jobs` ++ ++Custom wrappers must **forward** `--workspace` / `-w` to the framework callback (default `"default"` for local dev only). ++ ++**Deliverables** ++ ++- [x] `nemo customization automodel jobs submit <job.json> --workspace <workspace>` — `cli/inputs.py` validates `AutomodelJobInput` and POSTs to `.../automodel/jobs` (`tests/test_cli.py`). ++- [x] `jobs explain` — exposes `input_spec_schema` + `spec_schema` via framework `explain` (`tests/test_cli.py`). ++- [x] CLI tests: `-w` / `--workspace` in submit URL (`submit_path_for` + mocked `submit_remote`). ++ ++--- ++ ++### Step 6 — Tests & contract continuity ++ ++Relocate contract tests from legacy `customizer-automodel` path; validate router + contributor + compiler together. ++ ++**Deliverables** ++ ++- [x] Contract script import path fixed (`generate_configs.py` → `backends.config`); `services/automodel/tests/test_contract_configs.py` parses SFT/packing inputs + optional `--check` (embedding gated/skipped for v1). ++- [x] Unit/API: Automodel + customization router routes under `/v2/workspaces/{workspace}/automodel/...` (`plugins/nemo-automodel/tests/test_api.py`). ++- [x] Integration: compile-only via `services/automodel/tests/test_compiler.py` (contract fixture when present); CLI submit mocked in `test_cli.py`. ++- [ ] Agentic smoke: adapt `Platform/tests/agentic-use/customizer-lora-job-cli` → `nemo customization automodel` CLI. ++- [ ] E2E: job completes → **Model** entity exists → fileset populated → LoRA metadata when `finetuning_type=lora`. 
++- [x] Workspace isolation: routes scoped by `{workspace}` path segment (`test_api.py`); full cross-workspace API test deferred to Jobs service integration. ++ ++--- ++ ++### Step 7 — SDK, OpenAPI, docs & rollout ++ ++**API & SDK polish** ++ ++- [ ] OpenAPI tags: “Automodel Training Jobs”. ++- [ ] List/get/delete/results routes via `add_job_routes` defaults under `/v2/workspaces/{workspace}/automodel/jobs`. ++- [x] SDK hub: `client.customization.automodel.jobs.create(workspace=..., spec=...)` — paths under `/v2/workspaces/{workspace}/automodel/jobs`; **no** silent global namespace default (document `workspace="default"` for local dev). ++- [ ] Error mapping: `PlatformJobCompilationError` / `validate_for_training` → 422; `check_dataset_access` / model entity auth failures → 403 or 422 with clear copy. ++- [ ] Migration guide field table: legacy flat `training` + single `dataset` string → `AutomodelJobInput` sections; `output_model` → `output`. ++ ++**Docs & deploy** ++ ++- [ ] Automodel plugin README: install, enabled-plugins, CLI examples, sample `job.json`. ++- [ ] Config reference: `NMP_AUTOMODEL_*` (training/tasks image overrides, resource defaults); link `NMP_JOBS_ENABLE_SUBPROCESS_EXECUTOR` / [Step 0](#step-0--design-lock--platform-prerequisites). ++- [ ] Migration guide: `CustomizationJob` / `CustomizationJobInput` → `AutomodelJobInput` field mapping. ++- [ ] Helm/assets: deploy `nmp/automodel-training` + `nmp/automodel-tasks` (replace `customizer-automodel` on product cutover — Studio migration still out of scope). ++ ++--- ++ ++### Success criteria (exit checks) ++ ++- [ ] `nemo customization automodel jobs submit job.json -w acme-corp` → `/apis/customization/v2/workspaces/acme-corp/automodel/jobs`; fails fast without Docker/GPU. ++- [ ] `POST` accepts `AutomodelJobInput`; GET returns enriched `AutomodelJobOutput` in `acme-corp`. ++- [ ] Completed job: **Model** entity + fileset + adapter metadata in same workspace. 
++- [ ] Training progress on Jobs task `status_details.metrics`. ++- [ ] Training image CI smoke passes. ++- [ ] No legacy `platform_job_config_compiler` / multi-backend customizer dependency. ++- [ ] Router test: second fake contributor merges without router code changes. ++ ++--- ++ ++## Goals (from requirements) ++ ++| Requirement | Intent | ++|-------------|--------| ++| **First-class CLI** | Submit/run jobs from a **simplified JSON** job config (not the full CustomizationJob API surface). Pattern: Data Designer’s `[CONFIG_SOURCE]` → canonical spec (`plugins/nemo-data-designer/.../cli/inputs.py`). | ++| **Fail if Docker disabled for jobs** | Automodel training is GPU + container-only. Reject compile/submit when `platform.runtime` is not `docker` or Docker daemon/GPUs are unavailable (stricter than today’s “warn and set runtime NONE”). Independent of `jobs.enable_subprocess_executor`. | ++| **First-class API** | Workspace-scoped REST under `/apis/customization/v2/workspaces/{workspace}/automodel/...` — `{workspace}` is a **required path segment** on every job route (create, list, get, delete, results). Served via the **customization router** (single `/apis/customization` mount); Automodel is the first contributor. | ++| **Customization router** | **`nemo-customizer-plugin`** owns `/apis/customization` and merges HTTP/CLI/SDK from contributors (Automodel v1; RL / Megatron / Unsloth later) — no monolithic `nmp-customizer`, no per-backend top-level `/apis/*` services. | ++| **Automodel-only job path** | No NeMo RL, Megatron-Bridge, DPO, GRPO, or multi-backend dispatch. Single compiler → single training step image. | ++| **Internal callback API** | Keep task-level progress updates to the Jobs service (`sdk.jobs.tasks.create_or_update`) from training subprocesses — not a public user API. 
| ++| **Simplified training image** | New image derived from `nemo-automodel` NGC base with only platform task glue + `nemo_automodel`, not full `nmp-customizer` / RL / Megatron stack. | ++| **Entity lifecycle** | Jobs still: download artifacts → train → upload checkpoint → **create/update Model entity** (and LoRA adapter metadata where applicable). | ++| **Jobs API parity** | `service_name="customization"` on `add_job_routes`; auto `automodel-{id}` names; `training.execution_profile` in spec; CORE routes only. | ++ ++--- ++ ++## Platform jobs: `runtime` vs step executors ++ ++Two layers are easy to conflate; this plugin only cares about the second for **training steps**, but operators need both clear in config and docs. ++ ++| Layer | Config | Cardinality | Meaning | ++|-------|--------|-------------|---------| ++| **Platform deployment** | `platform.runtime` | **One value** per process (`docker` \| `kubernetes` \| `none`) | How the platform orchestrates container workloads (Docker daemon vs K8s vs neither). **Not** “how every job step runs.” | ++| **Job step execution** | `platform_spec.steps[].executor` | **Per step** | Backend for that step: `cpu`/`gpu` + container → Docker or K8s; `subprocess` → host process (local dev / lightweight tasks). | ++ ++Today, when `platform.runtime: docker`, the Jobs service **implicitly** also registers `subprocess/default` (host execution) alongside `cpu/default` and `gpu/default` (Docker). That coupling is what makes `runtime: docker` sound like “everything runs in Docker.” ++ ++### Proposed: `jobs.enable_subprocess_executor` ++ ++Make host subprocess execution an **explicit** platform choice instead of a side effect of `runtime: docker`. 
++ ++| Field | Type | Default | Behavior | ++|-------|------|---------|----------| ++| `jobs.enable_subprocess_executor` | `bool` | `true` when `platform.runtime == docker` (local dev); **`false` on Kubernetes** unless explicitly set `true` | When `true`, register `subprocess/default` and allow steps with `provider: subprocess`. When `false`, omit subprocess from default profiles; CPU/GPU container steps use Docker (or K8s) only. Dev clusters may opt in explicitly; production K8s should leave host execution disabled. | ++ ++**Implementation:** [Step 0 — Platform Jobs flag](#step-0--design-lock--platform-prerequisites) (cross-cutting, not Automodel-only). ++ ++**Automodel plugin implications:** ++ ++- Training steps are **always** `cpu`/`gpu` + container → Docker; Automodel does **not** depend on `enable_subprocess_executor`. ++- Compile gate ([Step 4](#step-4--nemo-automodel-plugin-contributor--job)): **`platform.runtime == docker`** + daemon + GPUs — not “subprocess enabled.” ++- Optional `jobs run` ([Step 4](#step-4--nemo-automodel-plugin-contributor--job) 2b): subprocess only if the flag is enabled. ++- Prefer error copy: *“Automodel training requires `platform.runtime: docker` with GPU-backed container execution”* — avoid *“Docker job runtime”* without qualification. ++ ++**Example local config (explicit):** ++ ++```yaml ++platform: ++ runtime: docker ++ ++jobs: ++ enable_subprocess_executor: true # host steps for dev; training still uses cpu/gpu + container ++ executor_defaults: ++ docker: ++ launcher_tool_path: ./services/core/jobs/jobs-launcher/jobs-launcher ++ subprocess: ++ working_directory: /tmp/nmp-subprocess-jobs ++``` ++ ++Production / GPU-only deployments can set `enable_subprocess_executor: false` to avoid registering host execution while keeping `runtime: docker` for Automodel and other container jobs. 
++ ++--- ++ ++## Current state (reference) ++ ++### Legacy Customizer (`Platform/services/customizer/`) ++ ++- **API**: `CustomizationJobInput` / `CustomizationJobOutput` via `job_route_factory` (`api/v2/jobs/endpoints.py`). ++- **Compiler**: `platform_job_config_compiler` builds a **4-step** `PlatformJobSpec`: ++ 1. `nmp.customizer.tasks.file_io` (CPU) — download model + datasets ++ 2. Training (GPU) — backend selected in training compiler (`automodel` \| `nemo_rl` \| `megatron_bridge`) ++ 3. `file_io` upload ++ 4. `nmp.customizer.tasks.model_entity` — register model in Models service ++- **Automodel backend**: `tasks/training/backends/automodel/` — `compile_automodel_config()`, `AutomodelBackend`, `finetune.py` (wraps `nemo_automodel` recipes + `TrainingProgressCallback`). ++- **Image**: `customizer-automodel` (see `nmp/docker/Dockerfile.nmp-customizer`); contract tests in `Platform/tests/customizer-automodel-contract/`. ++- **Progress “callbacks”**: `JobsServiceProgressReporter` + `TrainingProgressCallback` call Jobs internal task API (rank-0 only). ++ ++### Platform plugin patterns (`Platform/plugins/`) ++ ++- Entry points: training plugins use `nemo.customization.contributors`; **`nemo-customizer-plugin`** uses `nemo.services` + `nemo.cli` key `customization`; jobs via `nemo.jobs` (`customization.<contributor>.<job>`, e.g. `customization.automodel.jobs`). ++- Jobs: `NemoJob` + `add_job_routes()` (`nemo_platform_plugin.jobs.routes`). ++- Reference plugins: `nemo-evaluator` (service + job scaffold), `nemo-data-designer` (CLI config file → spec), `nemo-agents` (service + multiple routers). ++ ++### Simplified config shape (already validated) ++ ++Contract input JSONs under `Platform/tests/customizer-automodel-contract/input_configs/` are a good starting point for the **CLI/API simplified spec** (e.g. `llama_3_2_1b_lora.json`): `model`, `dataset`, `training`, `schedule`, `batch`, `optimizer`, `parallelism`, `output_model`, optional `seed`. 
++ ++--- ++ ++## Target architecture ++ ++```mermaid ++flowchart TB ++ subgraph surfaces [Plugin surfaces] ++ CLI["nemo customization automodel jobs submit -w WS job.json"] ++ API["POST .../v2/workspaces/WS/automodel/jobs"] ++ SDK["client...jobs.create(workspace=WS)"] ++ end ++ ++ subgraph router [nemo-customizer] ++ CUST["CustomizationRouterService"] ++ MERGE["merge contributors"] ++ end ++ ++ subgraph plugin [plugins/nemo-automodel] ++ CONTrib["AutomodelContributor"] ++ JOB["AutomodelJob\n(NemoJob.compile)"] ++ CLI_MOD["automodel CLI subgroup"] ++ end ++ ++ subgraph pkg [services/automodel — library only, no HTTP server] ++ CORE["compile_spec / validate"] ++ TASK_TRAIN["tasks/training\n(automodel only)"] ++ TASK_IO["tasks/file_io"] ++ TASK_ME["tasks/model_entity"] ++ end ++ ++ subgraph deploy [platform.runtime docker] ++ JOBS["Jobs service"] ++ DOCKER["cpu/gpu steps → Docker"] ++ MODELS["Models service"] ++ FILES["Files service"] ++ end ++ ++ CLI --> CUST ++ API --> CUST ++ SDK --> CUST ++ CUST --> MERGE --> CONTrib ++ CONTrib --> JOB ++ CONTrib --> CLI_MOD ++ CONTrib --> CORE ++ JOB -->|compile PlatformJobSpec| JOBS ++ JOBS --> DOCKER ++ DOCKER --> TASK_IO ++ DOCKER --> TASK_TRAIN ++ DOCKER --> TASK_ME ++ TASK_TRAIN -->|internal tasks API| JOBS ++ TASK_ME --> MODELS ++ TASK_IO --> FILES ++``` ++ ++### Package layout (proposed) ++ ++``` ++Platform/ ++ plugins/nemo-customizer/ # router + contributor protocol (v1) ++ pyproject.toml ++ src/nemo_customizer/ ++ router.py # CustomizationRouterService (nemo.services → customization) ++ cli.py # CustomizationCLI ++ contributor.py # re-export CustomizationContributor from nemo_platform_plugin ++ discovery.py # re-export discover_customization_contributors ++ docs/CUSTOMIZATION.md # contributor author guide ++ ++ plugins/nemo-automodel/ ++ SCOPE.md # this file ++ pyproject.toml ++ src/nemo_automodel_plugin/ ++ contributor.py # AutomodelContributor (routers + CLI subgroup) ++ cli.py + cli/inputs.py # JSON config → spec ++ 
config.py # NemoConfig (image names, defaults) ++ schema.py # AutomodelJobInput, AutomodelJobOutput, sub-models ++ jobs/ ++ jobs.py # AutomodelJob (compile + optional local run) ++ sdk/ # optional hub resources ++ docs/ ++ ++ services/automodel/ # Python package nmp-automodel — tasks/compiler only (no HTTP server) ++ pyproject.toml ++ src/nmp/automodel/ ++ config.py ++ platform_client.py # model entity fetch (from customizer) ++ app/jobs/ ++ compiler.py # Automodel-only PlatformJobSpec (4 steps, slim) ++ training/ ++ compiler.py # single GPU step ++ schemas.py ++ file_io/ # port or thin wrapper from customizer ++ model_entity/ # port from customizer ++ tasks/ ++ training/backends/automodel/ # port: config, backend, finetune, callbacks ++ file_io/ ++ model_entity/ ++ docker/ ++ Dockerfile.nmp-automodel-training # GPU: nmp-automodel-base + finetune ++ Dockerfile.nmp-automodel-tasks # CPU: file_io / model_entity glue (slimmer) ++ tests/ ++ ++``` ++ ++**Dependency rule**: ++ ++| Package | Depends on | Provides | ++|---------|------------|----------| ++| **`nemo-customizer-plugin`** | `nemo-platform-plugin` | Router service/CLI; `CustomizationContributor` protocol and `discover_customization_contributors()` live in **`nemo_platform_plugin`** | ++| **`nemo-automodel`** (plugin) | `nemo-platform-plugin`, `nmp-automodel` (+ `nemo-customizer-plugin` at runtime via `enabled-plugins`) | `AutomodelContributor`, schemas; Step 5 `cli/inputs.py` optional | ++| **`nmp-automodel`** (service) | `nmp-common`, platform SDK types | Compilers, task entrypoints, Dockerfiles | ++ ++Avoid pulling entire legacy `nmp-customizer`. **`nemo-platform-plugin`** holds the contributor protocol and discovery (IGW-aligned); **`nemo-customizer-plugin`** holds only the router service/CLI merge logic. ++ ++### Customization router (in scope — v1) ++ ++**Problem:** `discover_services()` maps `nemo.services` entry-point **keys** 1:1 to mounted apps (`/apis/<key>/...`). Only one plugin can own `customization`. 
A monolithic customizer is out; multiple training backends (Automodel, RL, Megatron, Unsloth) must share one URL tree without boxing future plugins into Automodel’s package. ++ ++**Solution:** **`nemo-customizer-plugin`** ships **`CustomizationRouterService`** as the sole `nemo.services` registration for `customization`. Training plugins register as **contributors** via a new entry-point group; they do **not** register their own top-level `nemo.services` key. ++ ++| Piece | Owner | Registration | ++|-------|--------|----------------| ++| `/apis/customization/...` mount | `nemo-customizer-plugin` | `nemo.services` → `customization` = `CustomizationRouterService` | ++| Automodel routes | `nemo-automodel` plugin | `nemo.customization.contributors` → `automodel` = `AutomodelContributor` | ++| Future RL / Megatron / Unsloth | Each backend’s plugin | Same group, distinct keys: `rl`, `megatron`, `unsloth`, … | ++| Task/compiler library | `nmp-automodel` | No HTTP; imported by plugin + Jobs task images | ++ ++**Contributor contract** (protocol defined in `nemo_platform_plugin`; re-exported from `nemo_customizer.contributor`): ++ ++```python ++class CustomizationContributor(Protocol): ++ """One training backend under /apis/customization.""" ++ ++ name: ClassVar[str] # must match entry-point key, e.g. "automodel" ++ ++ def get_routers(self) -> list[RouterSpec]: ++ """e.g. prefix v2/workspaces/{workspace}/automodel + job routes.""" ++ ++ def get_cli(self) -> typer.Typer | None: ++ """Subgroup mounted at `nemo customization <name>`.""" ++``` ++ ++SDK: **`nemo-customizer-plugin`** registers `nemo.sdk` → `customization` and composes per-contributor SDK modules (e.g. `nemo_automodel_plugin.sdk.resources` → `client.customization.automodel`). ++ ++**Router behavior:** ++ ++1. `discover_customization_contributors()` loads all `nemo.customization.contributors` entry points (fault-isolated, allowlist via `NEMO_PLUGIN_CUSTOMIZATION_CONTRIBUTORS_ALLOWLIST` or `NEMO_PLUGIN_ALLOWLIST`). ++2. 
If **zero** contributors load, **fail startup** with a clear configuration error (router enabled but no backends). ++3. `CustomizationRouterService.get_routers()` concatenates each contributor’s `RouterSpec` list (stable sort by `name`); `dependencies` = union of contributor + platform service deps (`merge_router_dependencies()` at router startup). ++4. `CustomizationCLI.get_cli()` builds `typer.Typer(name="customization")` and mounts each contributor subgroup (`automodel`, …). ++5. OpenAPI / SDK generation includes the merged tree under service name `customization` only. ++6. **No route collision:** each contributor owns a distinct path segment after `.../workspaces/{workspace}/` (Automodel → `automodel`; legacy multi-backend `jobs` stays unmounted until a contributor revives it intentionally). ++ ++**Automodel plugin wiring (v1):** ++ ++| Surface | Entry point | Notes | ++|---------|-------------|--------| ++| HTTP | `nemo.customization.contributors.automodel` | **Not** `nemo.services` — router owns the mount | ++| Jobs | `nemo.jobs` → `customization.automodel.jobs` | Unchanged | ++| CLI | Via contributor `get_cli()` | `nemo customization automodel jobs ...` | ++| SDK | `nemo-customizer-plugin` → `nemo.sdk:customization` composes contributor SDKs | `client.customization.automodel.jobs` | ++| Tasks | `nmp-automodel` package | No server | ++ ++**pyproject.toml (Automodel plugin):** ++ ++```toml ++[project.entry-points."nemo.customization.contributors"] ++automodel = "nemo_automodel_plugin.contributor:AutomodelContributor" ++ ++[project.entry-points."nemo.jobs"] ++"customization.automodel.jobs" = "nemo_automodel_plugin.jobs.jobs:AutomodelJob" ++``` ++ ++**`nemo-customizer-plugin` pyproject.toml:** ++ ++```toml ++[project.entry-points."nemo.services"] ++customization = "nemo_customizer.router:CustomizationRouterService" ++ ++[project.entry-points."nemo.cli"] ++customization = "nemo_customizer.cli:CustomizationCLI" ++ ++[project.entry-points."nemo.sdk"] 
++customization = "nemo_customizer.sdk.resources:customization_sdk_resources" ++``` ++ ++Enable in platform workspace / `enabled-plugins` alongside `nemo-automodel`. ++ ++**Implementation checklist:** [Step 1 — `nemo-customizer-plugin`](#step-1--nemo-customizer-blocks-automodel-http). ++ ++### URL routing (decided) ++ ++Platform mounts the router at `/apis/customization/<prefix>/...`, where `<prefix>` is each contributor’s `RouterSpec.prefix`. Automodel contributor prefix: ++ ++| Piece | Value | ++|-------|--------| ++| Router `NemoService.name` | `customization` | ++| Contributor key | `automodel` | ++| Automodel `RouterSpec.prefix` | `v2/workspaces/{workspace}/automodel` | ++| Example job create | `POST /apis/customization/v2/workspaces/{workspace}/automodel/jobs` | ++| Legacy (deprecated) | `POST /apis/customization/v2/workspaces/{workspace}/jobs` — **not registered** in v1 | ++ ++**No `/train/` segment:** Flat `/jobs` under `.../automodel/` (`NemoJob.job_collection_path = "/jobs"`). ++ ++| Job wiring | Value | ++|----------|--------| ++| `NemoJob.job_collection_path` | `"/jobs"` | ++| `NemoJob.name` | `"jobs"` (CLI/SDK subgroup suffix only) | ++| `nemo.jobs` entry key | `customization.automodel.jobs` | ++| `add_job_routes(..., service_name=)` | **`"customization"`** (required; sets Jobs `source` + filters) | ++| `generate_job_name` | **`generate_automodel_id`** → `automodel-{hex}` | ++| `route_options` | **`[JobRouteOption.CORE]`** (no pause/resume v1) | ++| `training.execution_profile` | Spec field → GPU step profile; default from `NMP_AUTOMODEL_DEFAULT_TRAINING_EXECUTION_PROFILE` | ++| Request `profile` body | **Deferred** — use spec field until `BaseJobRequest` plumbing lands | ++ ++Do **not** register `nemo.services` = `automodel` (would split the product URL tree). 
++ ++**Contributor job mount (reference):** ++ ++```python ++from nmp.common.jobs.api_factory import JobRouteOption ++from nemo_platform_plugin.jobs.routes import add_job_routes ++ ++def get_routers(self) -> list[RouterSpec]: ++ return [ ++ RouterSpec( ++ prefix="v2/workspaces/{workspace}/automodel", ++ router=add_job_routes( ++ AutomodelJob, ++ service_name="customization", ++ generate_job_name=generate_automodel_id, ++ route_options=[JobRouteOption.CORE], ++ default_profile=plugin_config.default_training_execution_profile, ++ ), ++ ), ++ ] ++``` ++ ++**CLI:** `nemo customization automodel jobs submit job.json` — router CLI + Automodel contributor subgroup. ++ ++### Workspace scoping (required) ++ ++All Automodel resources are scoped to a **platform workspace** (tenant/project boundary). The workspace is carried on the URL path for HTTP, on CLI/SDK calls for clients, and in job/task runtime env — it is **not** a separate top-level field in the simplified job JSON body. ++ ++#### API routes (full pattern) ++ ++Mount prefix: `v2/workspaces/{workspace}/automodel` → base: ++ ++`/apis/customization/v2/workspaces/{workspace}/automodel` ++ ++| Operation | Method | Path (after base) | ++|-----------|--------|-------------------| ++| Create job | `POST` | `/jobs` | ++| List jobs | `GET` | `/jobs` | ++| Get job | `GET` | `/jobs/{job_name}` | ++| Delete job | `DELETE` | `/jobs/{job_name}` | ++| Job results | `GET` | `/jobs/{job_name}/results/...` | ++ ++Example: ++ ++```http ++POST /apis/customization/v2/workspaces/acme-corp/automodel/jobs ++Content-Type: application/json ++ ++{ "model": "llama-3-8b-base", "dataset": { ... }, ... } ++``` ++ ++`acme-corp` is the scope for: authz checks, Jobs service record, Models/Filesets entities, and compiled fileset `workspace` fields. 
++ ++#### Workspace in the job spec (body vs path) ++ ++| Source | Role | ++|--------|------| ++| **Path `{workspace}`** | Authoritative scope for the job and all entities created by it (output model, output fileset, Jobs record). | ++| **Spec `model`** | Model entity **name** in the path workspace, or qualified `other-workspace/model-name` for cross-workspace reads (same as legacy `CustomizationJobInput.model`). | ++| **Spec `dataset`** | `{ training: "name" }` or `{ training: "workspace/name" }` — bare names resolve in the path workspace (no `fileset://` prefix). | ++| **Spec `output.name`** | New or updated `ModelEntity` **in the path workspace** only. | ++| **Body `workspace` field** | **Do not add** — avoids conflicting with the path param. | ++ ++`compile(workspace, spec, ...)` and `to_spec(..., workspace=...)` receive the path workspace from `add_job_routes` / `job_route_factory` (same contract as `nemo_platform_plugin.jobs.routes`). ++ ++#### CLI ++ ++Auto-generated `submit` / `run` include `--workspace` / `-w` (default `"default"`). Custom wrappers must **forward** it to the framework callback: ++ ++```bash ++nemo customization automodel jobs submit job.json --workspace acme-corp ++nemo customization automodel jobs submit job.json -w acme-corp ++# execution profile: set training.execution_profile in job.json (request --profile body deferred) ++``` ++ ++Submit URL (see `nemo_platform_plugin.commands` job submit helper): ++ ++`/apis/customization/v2/workspaces/{workspace}/automodel/jobs` ++ ++i.e. `/apis/{NemoService.name}/{RouterSpec.prefix}/...` with `name=customization` and prefix `v2/workspaces/{workspace}/automodel`. 
++ ++#### SDK ++ ++Hub resources take `workspace` on every call (pattern: evaluator `client.evaluator...`): ++ ++```python ++job = client.customization.automodel.jobs.create( ++ workspace="acme-corp", ++ spec=AutomodelJobInput(...), ++) ++status = client.customization.automodel.jobs.retrieve( ++ workspace="acme-corp", ++ name=job.name, ++) ++``` ++ ++SDK must not default silently to a global namespace; document `workspace="default"` for local dev only. ++ ++#### Runtime (compiled job + tasks) ++ ++| Stage | Workspace usage | ++|-------|------------------| ++| **Compile** | `fetch_model_entity(spec.model, workspace, sdk)`; output fileset refs use `workspace=None` in compile JSON and are resolved at runtime to the job workspace (legacy customizer pattern). | ++| **Jobs service** | Job created in path workspace. | ++| **Task containers** | `NEMO_JOB_WORKSPACE` (and `JobContext.workspace` / `get_workspace()`) set from job; `model_entity` task creates entities in that workspace. | ++| **Progress callbacks** | `sdk.jobs.tasks.create_or_update(..., workspace=job_ctx.workspace, job=job_ctx.job_id, ...)`. | ++| **List/filter** | API list endpoints return only jobs in the path workspace. | ++ ++#### Tests ++ ++→ [Step 5](#step-5--cli-submit-path) (CLI `-w`), [Step 6](#step-6--tests--contract-continuity) (API, integration, workspace isolation). ++ ++--- ++ ++## Work breakdown ++ ++Phases map to [Implementation order](#implementation-order). **Checklists and step-level detail live in the steps above**; sections below add design reference (Option A wiring, Studio verification, JSON spec) without duplicating deliverables. 
++ ++| Phase | Implementation step(s) | Topic | ++|-------|------------------------|--------| ++| 0 | [Step 0](#step-0--design-lock--platform-prerequisites) | Design lock, Jobs flag, schemas | ++| 1 | [Step 1](#step-1--nemo-customizer-blocks-automodel-http) | Customization router | ++| 2 | [Step 2](#step-2--nmp-automodel-package-core) | `nmp-automodel` compiler/tasks | ++| 3 | [Step 3](#step-3--container-images) | Docker images | ++| 4 | [Step 4](#step-4--nemo-automodel-plugin-contributor--job) | Automodel plugin + Docker gate | ++| 5 | [Step 5](#step-5--cli-submit-path) | CLI | ++| 6 | [Step 6](#step-6--tests--contract-continuity) | Tests | ++| 7 | [Step 7](#step-7--sdk-openapi-docs--rollout) | SDK / docs / deploy | ++| — | [Step 2](#step-2--nmp-automodel-package-core) (callbacks) | Internal Jobs task API (not public) | ++ ++### Phase 0 — Design lock ++ ++→ [Step 0](#step-0--design-lock--platform-prerequisites). Router design: [Customization router](#customization-router-in-scope--v1). ++ ++#### Input vs canonical spec — **decided: Option A** ++ ++On job **create**, the platform always: ++ ++1. Validates the POST body against **`AutomodelJobInput`** (`input_spec_schema`). ++2. Runs **`AutomodelJob.to_spec()`** → **`AutomodelJobOutput`** stored on the Jobs record (`spec_schema`). ++3. Runs **`compile()`** on the canonical output → `platform_spec` for execution. ++ ++Enrichment (auto output name/fileset, adapter vs model type, dataset ACL, model entity fetch) happens in step 2 — the Jobs service persists that result, not a post-compile rewrite. Rejected alternatives: single-schema POST (manual output fields), enrich-only-in-`compile()` (broken persistence), renamed input fields (unnecessary vs legacy). 
++ ++**`AutomodelJob` wiring:** ++ ++```python ++class AutomodelJobInput(BaseModel): # POST body / CLI JSON ++ model: str # name or workspace/name ++ dataset: DatasetSpec # training + optional validation fileset URIs ++ training: TrainingSpec # includes training_type, execution_profile, ... ++ output: OutputRequest | None = None # optional name only ++ # @model_validator: reject "output_model" key with legacy error message ++ ++class AutomodelJobOutput(BaseModel): # stored spec + GET response shape ++ output: OutputResponse # required: name, fileset, type (model | adapter) ++ # ... enriched fields from input ... ++ ++ def validate_for_training(self) -> None: ++ # Port MoE / parallelism rules from CustomizationJobOutput ++ ++class AutomodelJob(NemoJob): ++ name = "jobs" ++ job_collection_path = "/jobs" ++ input_spec_schema = AutomodelJobInput ++ spec_schema = AutomodelJobOutput ++ dependencies = ["entities", "auth", "jobs", "secrets", "files", "models"] ++ ++ @classmethod ++ async def to_spec(cls, input_spec, *, workspace, entity_client, async_sdk, is_local): ++ # Port transform_input_to_output + check_dataset_access per fileset ++ ++ @classmethod ++ async def compile(cls, *, workspace, spec: AutomodelJobOutput, ...): ++ spec.validate_for_training() ++ # nmp.automodel.app.jobs.compiler → PlatformJobSpec (4 steps) ++``` ++ ++**Implementation notes:** ++ ++- Port source: `Platform/services/customizer/src/nmp/customizer/utils.py` (`transform_input_to_output`). ++- `to_spec()` generates `output.fileset`, infers `output.type`, runs `fetch_model_entity` + `check_dataset_access`. ++- `compile()` receives **`AutomodelJobOutput` only**; calls `validate_for_training()` before building `PlatformJobSpec`. ++- Mount via `add_job_routes(..., service_name="customization", generate_job_name=generate_automodel_id)` — [URL routing](#url-routing-decided). ++- **CLI JSON** = `AutomodelJobInput`. **`jobs explain`** exposes both schemas. 
++ ++#### Deprecation — Platform spin-up and Studio (verified) ++ ++**Platform `AVAILABLE_SERVICES`** (`packages/nmp_platform_runner/src/nmp/platform_runner/registry.py`) does **not** include `customization` / `customizer`: ++ ++```18:33:packages/nmp_platform_runner/src/nmp/platform_runner/registry.py ++AVAILABLE_SERVICES: dict[str, str] = { ++ "hello-world": "nmp.hello_world.main:service", ++ "studio": "nmp.studio.main:service", ++ ... ++ "inference-gateway": "nmp.core.inference_gateway.main:service", ++} ++``` ++ ++`API_SERVICES` and `OPENAPI_SERVICES` likewise omit customization. Plugin services are merged at runtime via `discover_services()` (e.g. future `customization` from `nemo.services`), but the **legacy `nmp.customizer` microservice is not started** by default platform spin-up in this repo. ++ ++**Note:** The older `nmp/` tree still lists `"customization": "nmp.customizer.main:service"` in its copy of the registry — do not treat that as Platform default behavior. ++ ++**Studio today:** ++ ++| Signal | Status | ++|--------|--------| ++| `VITE_FF_CUSTOMIZER_ENABLED` | Default **`false`** (`featureFlags.ts`) | ++| Routes | Gated via `CUSTOMIZER_ENABLED` / `gateRoutes` — customization pages hidden when flag off | ++| Live API | Vendored hooks target `/apis/customization/v2/.../jobs`; comment states service removed and UI must not call at runtime | ++| Tests | MSW handlers in `mocks/handlers/customizer.ts`; `create-a-customization.spec.tsx` is **`describe.skip`** | ++ ++```8:9:Platform/web/packages/sdk/vendored/customizer/api.ts ++// Note: these hooks call /apis/customization/v2/... endpoints that won't exist while the customizer ++// service is removed. The customizer UI is feature-flagged off, so they should never be invoked at runtime. ++``` ++ ++**First PR implication:** Safe to register **`CustomizationRouterService`** plus **`AutomodelContributor`** without legacy `nmp-customizer`. Studio/SDK migration **out of scope**. 
++ ++### Phase 1 — `nmp-automodel` package core ++ ++→ [Step 2](#step-2--nmp-automodel-package-core). Port table and deliverables are defined there. ++ ++### Phase 2 — Plugin surfaces ++ ++→ [Step 4](#step-4--nemo-automodel-plugin-contributor--job) + [Step 5](#step-5--cli-submit-path). Requires [Step 1](#step-1--nemo-customizer-blocks-automodel-http). ++ ++### Phase 3 — Docker enforcement & GPU validation ++ ++→ [Step 4](#step-4--nemo-automodel-plugin-contributor--job) (compile-time checks). Today `validate_gpu_available_for_docker` only runs when `runtime == DOCKER` and reserved GPU list is empty — extend for all Automodel jobs. ++ ++### Phase 4 — Container images ++ ++→ [Step 3](#step-3--container-images). ++ ++### Phase 5 — Internal Jobs callback path ++ ++→ [Step 2](#step-2--nmp-automodel-package-core) (not a new public route). Optional later: webhooks. ++ ++### Phase 6 — API & SDK polish ++ ++→ [Step 7](#step-7--sdk-openapi-docs--rollout). ++ ++### Phase 7 — Testing & contract continuity ++ ++→ [Step 6](#step-6--tests--contract-continuity). ++ ++### Phase 8 — Docs & rollout ++ ++→ [Step 7](#step-7--sdk-openapi-docs--rollout). ++ ++--- ++ ++## Simplified JSON spec (draft) — `AutomodelJobInput` only ++ ++POST body and CLI JSON file use **`AutomodelJobInput`** only. After create, GET returns **`AutomodelJobOutput`** with enriched `output` (fileset, type). Validated in the context of the path **`workspace`** (or CLI `-w`). Entity names below are relative to that workspace unless qualified as `other-ws/name`. 
++ ++```json ++{ ++ "name": "optional-job-name", ++ "model": "llama-3-8b-base", ++ "dataset": { ++ "training": "my-sft-train", ++ "validation": "my-sft-val" ++ }, ++ "training": { ++ "training_type": "sft | distillation", ++ "finetuning_type": "lora | all_weights | lora_merged", ++ "lora": { "rank": 16, "alpha": 32, "merge": false, "target_modules": null }, ++ "max_seq_length": 2048, ++ "execution_profile": "gpu", ++ "teacher_model": "meta/llama-3.2-3b-instruct", ++ "distillation_ratio": 0.5, ++ "distillation_temperature": 1.0, ++ "teacher_precision": "bf16", ++ "offload_teacher": false ++ }, ++ "schedule": { "epochs": 1, "max_steps": 50, "val_check_interval": 25, "seed": 42 }, ++ "batch": { "global_batch_size": 8, "micro_batch_size": 1, "sequence_packing": false }, ++ "optimizer": { ++ "learning_rate": 5e-6, ++ "weight_decay": 0.01, ++ "warmup_steps": 0 ++ }, ++ "parallelism": { ++ "num_nodes": 1, ++ "num_gpus_per_node": 1, ++ "tensor_parallel_size": 1, ++ "pipeline_parallel_size": 1, ++ "context_parallel_size": 1 ++ }, ++ "output": { "name": "my-finetuned-model", "description": "optional" }, ++ "integrations": { ++ "wandb": { "enabled": true, "project": "my-project", "api_key_secret": "wandb-api-key" }, ++ "mlflow": null ++ } ++} ++``` ++ ++**Validation rules:** ++ ++- **`output_model` is rejected** at parse time (legacy: *"spec.output_model was removed. Use spec.output instead."*). ++- `teacher_model`, `distillation_*`, and `offload_teacher` only when `training_type` is `distillation` (omit for `sft`). ++- Optional `dataset.prompt_template` for non-chat prompt/completion data (chat datasets use tokenizer chat template — document in README). ++- Compiler may accept additional optimizer/parallelism fields required by contract JSONs even if omitted from this minimal example (`adam_beta1`, `expert_parallel_size`, …). 
++ ++**Training types in v1:** ++ ++| `training_type` | Automodel recipe | Notes | ++|-----------------|------------------|-------| ++| `sft` | `TrainFinetuneRecipeForNextTokenPrediction` | Default; LoRA / all_weights / lora_merged | ++| `distillation` | `KnowledgeDistillationRecipeForNextTokenPrediction` | Requires `teacher_model`; maps to Automodel `teacher_model`, `kd_ratio`, `kd_loss_fn` ([`nemo_automodel/recipes/llm/kd.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/llm/kd.py), example [`examples/llm_kd/llama3_2/llama3_2_1b_kd.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_kd/llama3_2/llama3_2_1b_kd.yaml)) | ++ ++**KD / distillation fields** (when `training_type: distillation`): mirror legacy Customizer API — `teacher_model` (entity ref in path workspace), `distillation_ratio` (→ `kd_ratio`, default `0.5`), `distillation_temperature` (→ `kd_loss_fn.temperature`, default `1.0`), `teacher_precision` (default `bf16`), optional `offload_teacher` (→ `offload_teacher_model`). Compiler port: `_configure_kd()` in legacy `automodel/config.py`. Validate tokenizer compatibility student/teacher before submit. ++ ++**Explicitly out of scope (v1):** DPO, GRPO, `nemo_rl`, `megatron_bridge`, quantized LoRA, DoRA, **embedding-model SFT** (`embed_1b` / biencoder recipe), **`deployment_config`** (post-train NIM deploy), request-body **`profile`** on job create (use `training.execution_profile`). ++ ++**Compiler responsibilities** (unchanged from legacy): ++ ++1. Resolve `model` → `ModelEntity` in **path workspace** (or explicit `ws/name` ref). ++2. Resolve dataset filesets in **path workspace** → local paths in download step. ++3. `compile_automodel_config()` → YAML/JSON for `finetune.py`. ++4. Generate output fileset + `ModelEntityTaskConfig` with `workspace` = path workspace (output model and fileset live in that workspace). 
++ ++--- ++ ++## Risk & complexity notes ++ ++| Topic | Note | ++|-------|------| ++| **Largest port** | `compile_automodel_config()` (~800 LOC) and `validate_for_training()` (MoE/parallelism). `deployment_config` and embedding SFT are **out of scope v1**. | ++| **Shared code** | File I/O and model_entity tasks are backend-agnostic — candidate for `nmp-common` or small `nmp-training-tasks` lib later; v1 can duplicate to ship faster. | ++| **Python version** | NGC automodel uses 3.12; platform pins 3.11 for API — task image runs 3.12 (existing customizer pattern). | ++| **KD / distillation** | In v1 JSON as `training_type: distillation`; compiler maps to Automodel KD recipe (see simplified JSON section). | ++| **Customizer service** | Remains in repo but unused; avoid dual registration in `NMP_SERVICES`. | ++| **Studio cutover** | **Out of scope** — no feature flag or Studio migration in Automodel v1; `VITE_FF_CUSTOMIZER_ENABLED` stays off. | ++| **Customization router** | v1 in scope: **`nemo-customizer-plugin`** (`CustomizationRouterService` + contributor protocol); Automodel first contributor; RL/Megatron/Unsloth add contributors later without new `/apis/*` services. | ++| **`runtime` vs subprocess flag** | `platform.runtime: docker` enables Docker-backed job profiles; `jobs.enable_subprocess_executor` separately controls host subprocess. Automodel training requires the former, not the latter. | ++ ++--- ++ ++## Non-critical follow-ups (post-v1) ++ ++Merged into [Implementation order](#implementation-order) and [Decisions](#decisions-resolved). Remaining items are not blocking the first PR: ++ ++| Topic | Notes | ++|-------|--------| ++| **`nemo.customization.contributors` in `_ALL_SURFACE_GROUPS`** | **Done:** `nemo_platform_plugin.discovery` — manifests + `discover_customization_contributors()` (IGW-aligned). 
| ++| **Request-body `profile` on job create** | Platform follow-up MR on `BaseJobRequest` + `add_job_routes`; until then CLI `--profile` may only map to `training.execution_profile` in JSON. | ++| **`custom_fields` passthrough** | Factory already supports; document if customers rely on it. | ++| **Full optimizer / MoE parallelism in public JSON** | Compiler + contracts may need fields beyond the minimal example; expand OpenAPI as contract port discovers gaps. | ++| **Chat dataset contract tests** | Port `*_full_sft_chat.json` when `prompt_template` behavior is documented. | ++ ++--- ++ ++## Decisions (resolved) ++ ++| # | Topic | Decision | ++|---|--------|----------| ++| 1 | **Service vs plugin-only** | **No standalone `nmp-automodel` HTTP server.** Automodel HTTP lives on **`AutomodelContributor`** merged by **`nemo-customizer-plugin`** (`CustomizationRouterService`) at `/apis/customization`. `nmp-automodel` is compiler + tasks only. | ++| 2 | **KD / distillation** | **Include in v1** simplified JSON when `training_type: distillation`. Map to [Automodel KD recipe](https://github.com/NVIDIA-NeMo/Automodel/tree/main/nemo_automodel/recipes/llm/kd.py) (`teacher_model`, `kd_ratio`, `kd_loss_fn`, optional `offload_teacher_model`). Port legacy `_configure_kd()` / `DistillationConfig` from customizer automodel backend. | ++| 3 | **Image naming** | **`nvcr.io/0921617854601259/nemo-platform-dev/nmp/automodel-training`** (GPU) and **`.../nmp/automodel-tasks`** (CPU). Do **not** reuse `customizer-automodel` or the upstream `nvcr.io/nvidia/nemo-automodel` image name. | ++| 4 | **Workspace package name** | **`nmp-automodel`** (PyPI upstream library remains `nemo-automodel` / NGC image name unchanged). | ++| 5 | **Studio cutover** | **Punted** — no Studio feature flag or migration to `.../automodel/...` in this scope. 
| ++| 6 | **`customization` owner** | **In scope v1:** dedicated **`nemo-customizer-plugin`** owns `nemo.services` key `customization`; backends register via **`nemo.customization.contributors`** (Automodel first; RL / Megatron / Unsloth later). `nemo-automodel` must **not** register `nemo.services` directly. | ++| 7 | **`enable_subprocess_executor` on K8s** | **Default `false` on Kubernetes**; explicit `true` only when dev clusters need host subprocess. Default `true` for `platform.runtime: docker` local dev. | ++| 8 | **Jobs `source` / naming** | **`service_name="customization"`** on `add_job_routes` (never default `nemo-automodel-plugin`). Auto names: **`automodel-{hex}`** via `generate_automodel_id`. | ++| 9 | **`execution_profile` v1** | In **`training.execution_profile`** on job spec; default from **`NMP_AUTOMODEL_DEFAULT_TRAINING_EXECUTION_PROFILE`**. Request-body `profile` on create — **deferred** (platform gap). | ++| 10 | **Embedding SFT** | **Out of scope v1** (causal LM + KD only); `embed_1b` contracts gated in Step 6 until product expands. | ++| 11 | **`deployment_config`** | **Out of scope v1** (post-train NIM deploy; Studio-adjacent). | ++| 12 | **Router zero contributors** | **Fail startup** if customization plugin is enabled but no `nemo.customization.contributors` load. | ++ +diff --git a/plugins/nemo-automodel/pyproject.toml b/plugins/nemo-automodel/pyproject.toml +new file mode 100644 +index 000000000..126816f9d +--- /dev/null ++++ b/plugins/nemo-automodel/pyproject.toml +@@ -0,0 +1,51 @@ ++[project] ++name = "nemo-automodel-plugin" ++version = "0.1.0" ++description = "NeMo Automodel customization contributor for NeMo Platform." 
++readme = "README.md" ++requires-python = ">=3.11,<3.14" ++dependencies = [ ++ "nemo-platform-plugin", ++ "nemo-platform", ++ "nmp-automodel", ++ "pydantic>=2.10.6", ++ "pydantic-settings>=2.6.1", ++ "typer>=0.12.5", ++] ++ ++[project.entry-points."nemo.customization.contributors"] ++automodel = "nemo_automodel_plugin.contributor:AutomodelContributor" ++ ++[project.entry-points."nemo.jobs"] ++"customization.automodel.jobs" = "nemo_automodel_plugin.jobs.jobs:AutomodelJob" ++ ++[build-system] ++requires = ["hatchling"] ++build-backend = "hatchling.build" ++ ++[tool.hatch.build.targets.wheel] ++packages = ["src/nemo_automodel_plugin"] ++ ++[tool.uv.sources] ++nemo-platform-plugin = { workspace = true } ++nemo-platform = { workspace = true } ++nmp-automodel = { workspace = true } ++nemo-customizer-plugin = { workspace = true } ++ ++[dependency-groups] ++dev = [ ++ "pytest>=8.3.4", ++ "pytest-asyncio>=0.25.3", ++ "ruff>=0.11.8", ++ "fastapi>=0.115.0", ++ "httpx>=0.27.0", ++ "nemo-customizer-plugin", ++] ++ ++[tool.pytest.ini_options] ++asyncio_mode = "auto" ++pythonpath = ["src"] ++testpaths = ["tests"] ++ ++[tool.pyright] ++extraPaths = ["src"] +diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/__init__.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/__init__.py +new file mode 100644 +index 000000000..7cca7f911 +--- /dev/null ++++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/__init__.py +@@ -0,0 +1,4 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
++# SPDX-License-Identifier: Apache-2.0 ++ ++"""NeMo Automodel customization contributor.""" +diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/__init__.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/__init__.py +new file mode 100644 +index 000000000..4b22fee8f +--- /dev/null ++++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/__init__.py +@@ -0,0 +1,9 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Automodel contributor CLI helpers.""" ++ ++from nemo_automodel_plugin.cli.inputs import apply_automodel_job_cli_overrides, load_job_json ++from nemo_automodel_plugin.cli.main import AutomodelContributorCLI ++ ++__all__ = ["AutomodelContributorCLI", "apply_automodel_job_cli_overrides", "load_job_json"] +diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/inputs.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/inputs.py +new file mode 100644 +index 000000000..fce5a36fb +--- /dev/null ++++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/inputs.py +@@ -0,0 +1,99 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++"""CLI overrides: submit/run accept a job JSON file instead of ``--spec``.""" ++ ++from __future__ import annotations ++ ++import json ++from collections.abc import Callable ++from pathlib import Path ++ ++import typer ++ ++from nemo_automodel_plugin.schema import AutomodelJobInput ++ ++_JOB_JSON_HELP = "Path to Automodel job JSON (AutomodelJobInput schema)." 
++
++
++def load_job_json(path: Path) -> str:
++    """Load and validate a UTF-8 job JSON file; return the canonical JSON string for ``--spec``."""
++    data = json.loads(path.read_text(encoding="utf-8"))
++    validated = AutomodelJobInput.model_validate(data)
++    return validated.model_dump_json()
++
++
++def apply_automodel_job_cli_overrides(group: typer.Typer) -> None:
++    """Flat ``automodel`` CLI: ``submit JOB.json``; ``run`` is disabled."""
++    _replace_job_run_disabled(group)
++    _replace_job_submit(group)
++
++
++def _pluck_callback(group: typer.Typer, verb: str) -> Callable[..., None]:
++    """Return the callback registered for ``verb``; raise RuntimeError when there is none."""
++    callback = next((c.callback for c in group.registered_commands if c.name == verb), None)
++    if callback is None:
++        raise RuntimeError(f"missing {verb!r} callback to override")
++    return callback
++
++
++def _drop_command(group: typer.Typer, name: str) -> None:
++    """Unregister every command named ``name`` from ``group``."""
++    group.registered_commands = [c for c in group.registered_commands if c.name != name]
++
++
++def _replace_job_run_disabled(group: typer.Typer) -> None:
++    """Swap the ``run`` command for a stub that prints a submit hint and exits with code 1."""
++    _drop_command(group, "run")
++
++    @group.command("run")
++    def run(
++        typer_ctx: typer.Context,
++        job_json: Path | None = typer.Argument(None, metavar="JOB_JSON", help=_JOB_JSON_HELP),
++    ) -> None:
++        del typer_ctx, job_json
++        typer.secho(
++            "Automodel does not support local run. Submit to the platform API instead:\n"
++            "  nemo customization automodel submit JOB_JSON -w WORKSPACE",
++            err=True,
++            fg=typer.colors.RED,
++        )
++        raise typer.Exit(code=1)
++
++
++def _replace_job_submit(group: typer.Typer) -> None:
++    """Re-register ``submit`` to take a JOB_JSON path and forward its validated JSON as ``spec``."""
++    original = _pluck_callback(group, "submit")
++
++    @group.command("submit")
++    def submit(
++        typer_ctx: typer.Context,
++        job_json: Path = typer.Argument(..., metavar="JOB_JSON", help=_JOB_JSON_HELP),
++        workspace: str = typer.Option("default", "--workspace", "-w", help="Target workspace."),
++        profile: str | None = typer.Option(None, "--profile"),
++        cluster: str | None = typer.Option(None, "--cluster"),
++        base_url: str | None = typer.Option(
++            None,
++            "--base-url",
++            help=(
++                "Override platform API host. If omitted: --cluster, then CLI context, "
++                "then $NMP_BASE_URL, then http://localhost:8080."
++            ),
++        ),
++        options: list[str] = typer.Option([], "-o", help="Backend option override, 'backend.key=value'."),
++        options_file: Path | None = typer.Option(None, "--options-file"),
++    ) -> None:
++        spec_json = load_job_json(job_json)
++        original(
++            typer_ctx,
++            spec=spec_json,
++            spec_file=None,
++            options=options,
++            options_file=options_file,
++            profile=profile,
++            cluster=cluster,
++            base_url=base_url,
++            workspace=workspace,
++            config=None,
++            config_file=None,
++        )
+diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/main.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/main.py
+new file mode 100644
+index 000000000..e0d2072d7
+--- /dev/null
++++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/main.py
+@@ -0,0 +1,20 @@
++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
++# SPDX-License-Identifier: Apache-2.0 ++ ++"""CLI hooks for the Automodel customization contributor.""" ++ ++from __future__ import annotations ++ ++import typer ++from nemo_platform_plugin.job import NemoJob ++ ++from nemo_automodel_plugin.cli.inputs import apply_automodel_job_cli_overrides ++from nemo_automodel_plugin.jobs.jobs import AutomodelJob ++ ++ ++class AutomodelContributorCLI: ++ """Passed to ``add_job_commands`` to override job submit/run with job-file args.""" ++ ++ def update_job_cli(self, job_cls: type[NemoJob], group: typer.Typer) -> None: ++ if job_cls is AutomodelJob: ++ apply_automodel_job_cli_overrides(group) +diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/config.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/config.py +new file mode 100644 +index 000000000..502909e71 +--- /dev/null ++++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/config.py +@@ -0,0 +1,31 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Plugin configuration for Automodel training.""" ++ ++from __future__ import annotations ++ ++import os ++ ++from pydantic_settings import BaseSettings, SettingsConfigDict ++ ++ ++class AutomodelPluginConfig(BaseSettings): ++ """Environment-driven Automodel plugin settings.""" ++ ++ model_config = SettingsConfigDict(env_prefix="NMP_AUTOMODEL_", extra="ignore") ++ ++ default_training_execution_profile: str = "gpu" ++ training_image: str = "nmp/automodel-training" ++ tasks_image: str = "nmp/automodel-tasks" ++ ++ ++def get_config() -> AutomodelPluginConfig: ++ return AutomodelPluginConfig() ++ ++ ++def generate_automodel_id() -> str: ++ """Generate a job name when the submitter omits ``name``.""" ++ import uuid ++ ++ return f"automodel-{uuid.uuid4().hex[:12]}" +diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/contributor.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/contributor.py +new file mode 100644 +index 000000000..1a073533c +--- /dev/null ++++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/contributor.py +@@ -0,0 +1,76 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Automodel customization contributor.""" ++ ++from __future__ import annotations ++ ++from typing import ClassVar ++ ++import typer ++from fastapi import APIRouter ++from nemo_platform_plugin.jobs.routes import add_job_routes ++from nemo_platform_plugin.service import RouterSpec ++from nmp.common.jobs.api_factory import JobRouteOption ++ ++from nemo_automodel_plugin.config import generate_automodel_id, get_config ++from nemo_automodel_plugin.jobs.jobs import AutomodelJob ++ ++ ++class AutomodelContributor: ++ """Registers Automodel routes under the customization router.""" ++ ++ name: ClassVar[str] = "automodel" ++ dependencies: ClassVar[list[str]] = ["entities", "auth", "jobs", "secrets", "files", "models"] ++ ++ def get_routers(self) -> list[RouterSpec]: ++ config = get_config() ++ router = APIRouter() ++ ++ @router.get("/healthz") ++ async def healthz() -> dict[str, str]: ++ return {"backend": self.name, "status": "ok"} ++ ++ jobs_router = add_job_routes( ++ AutomodelJob, ++ service_name="customization", ++ generate_job_name=generate_automodel_id, ++ route_options=[JobRouteOption.CORE], ++ default_profile=config.default_training_execution_profile, ++ ) ++ ++ return [ ++ RouterSpec( ++ router=router, ++ prefix="/v2/workspaces/{workspace}/automodel", ++ tag="Automodel", ++ description="Automodel contributor health.", ++ ), ++ RouterSpec( ++ router=jobs_router, ++ prefix="/v2/workspaces/{workspace}", ++ tag="Automodel Jobs", ++ description="Automodel training jobs.", ++ ), ++ ] ++ ++ def get_cli(self) -> typer.Typer: ++ from nemo_automodel_plugin.cli.inputs import apply_automodel_job_cli_overrides ++ from nemo_platform_plugin.commands import ( ++ _add_explain_command, ++ _add_run_command, ++ _add_submit_command, ++ ) ++ from nemo_platform_plugin.scheduler import NemoJobScheduler ++ ++ app = typer.Typer( ++ name=self.name, ++ help="Automodel training jobs (SFT, distillation).", ++ no_args_is_help=True, ++ ) ++ scheduler 
= NemoJobScheduler() ++ _add_run_command(app, AutomodelJob, scheduler) ++ _add_submit_command(app, AutomodelJob, scheduler) ++ _add_explain_command(app, AutomodelJob, scheduler) ++ apply_automodel_job_cli_overrides(app) ++ return app +diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/jobs/__init__.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/jobs/__init__.py +new file mode 100644 +index 000000000..e5725ea5a +--- /dev/null ++++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/jobs/__init__.py +@@ -0,0 +1,2 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 +diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/jobs/jobs.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/jobs/jobs.py +new file mode 100644 +index 000000000..1b1023787 +--- /dev/null ++++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/jobs/jobs.py +@@ -0,0 +1,97 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Automodel training job (NemoJob).""" ++ ++from __future__ import annotations ++ ++from typing import ClassVar, cast ++ ++from nemo_platform import AsyncNeMoPlatform ++from nemo_platform_plugin.config import NemoPlatformConfig, Runtime ++from nemo_platform_plugin.job import NemoJob ++from nemo_platform_plugin.jobs.api_factory import PlatformJobSpec ++from nemo_platform_plugin.jobs.docker import validate_gpu_available_for_docker ++from nemo_platform_plugin.jobs.exceptions import PlatformJobCompilationError ++from nmp.automodel.app.jobs.compiler import platform_job_config_compiler ++from pydantic import BaseModel ++ ++from nemo_automodel_plugin.config import get_config ++from nemo_automodel_plugin.schema import AutomodelJobInput, AutomodelJobOutput ++from nemo_automodel_plugin.transform import transform_input_to_output ++ ++ ++def _require_docker_runtime() -> None: ++ platform_config = NemoPlatformConfig.get() ++ if platform_config.runtime != Runtime.DOCKER: ++ raise PlatformJobCompilationError( ++ "Automodel training requires platform.runtime: docker with GPU-backed container execution.", ++ ) ++ from nemo_platform_plugin.config import validate_docker_available ++ ++ if not validate_docker_available(): ++ raise PlatformJobCompilationError( ++ "Automodel training requires a reachable Docker daemon (platform.runtime: docker).", ++ ) ++ ++ ++class AutomodelJob(NemoJob): ++ """GPU Automodel fine-tuning job under the customization router.""" ++ ++ name: ClassVar[str] = "jobs" ++ description: ClassVar[str] = "Automodel SFT and knowledge-distillation training jobs." 
++ job_collection_path: ClassVar[str | None] = "/automodel/jobs" ++ input_spec_schema: ClassVar[type[BaseModel] | None] = AutomodelJobInput ++ spec_schema: ClassVar[type[BaseModel] | None] = AutomodelJobOutput ++ dependencies: ClassVar[list[str]] = ["entities", "auth", "jobs", "secrets", "files", "models"] ++ ++ @classmethod ++ async def to_spec( ++ cls, ++ input_spec: BaseModel, ++ workspace: str, ++ entity_client: object, ++ async_sdk: object, ++ is_local: bool, ++ ) -> AutomodelJobOutput: ++ del entity_client, is_local ++ job_input = ( ++ input_spec ++ if isinstance(input_spec, AutomodelJobInput) ++ else AutomodelJobInput.model_validate(input_spec.model_dump()) ++ ) ++ return await transform_input_to_output(job_input, workspace, cast(AsyncNeMoPlatform, async_sdk)) ++ ++ @classmethod ++ async def compile( ++ cls, ++ workspace: str, ++ spec: BaseModel, ++ entity_client: object, ++ job_name: str | None, ++ async_sdk: object, ++ profile: str | None = None, ++ options: dict | None = None, ++ ) -> PlatformJobSpec: ++ del entity_client, options ++ _require_docker_runtime() ++ canonical = ( ++ spec if isinstance(spec, AutomodelJobOutput) else AutomodelJobOutput.model_validate(spec.model_dump()) ++ ) ++ canonical.validate_for_training() ++ ++ plugin_config = get_config() ++ execution_profile = ( ++ canonical.training.execution_profile or profile or plugin_config.default_training_execution_profile ++ ) ++ ++ platform_spec = await platform_job_config_compiler( ++ canonical, ++ workspace, ++ cast(AsyncNeMoPlatform, async_sdk), ++ job_name=job_name, ++ profile=execution_profile, ++ ) ++ ++ validate_gpu_available_for_docker(platform_spec) ++ return platform_spec +diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py +new file mode 100644 +index 000000000..30a9ef587 +--- /dev/null ++++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py +@@ -0,0 +1,240 @@ ++# SPDX-FileCopyrightText: 
Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Automodel job input/output schemas (simplified JSON v1).""" ++ ++from __future__ import annotations ++ ++from typing import Any, Literal, Self ++ ++from pydantic import BaseModel, ConfigDict, Field, ValidationError, model_validator ++ ++ ++class LoRAParams(BaseModel): ++ model_config = ConfigDict(extra="forbid") ++ ++ rank: int = Field(default=16, gt=0) ++ alpha: int = Field(default=32, gt=0) ++ merge: bool = False ++ target_modules: list[str] | None = None ++ ++ ++class DatasetSpec(BaseModel): ++ model_config = ConfigDict(extra="forbid") ++ ++ training: str = Field(description="Training fileset as 'name' or 'workspace/name'.") ++ validation: str | None = None ++ prompt_template: str | None = None ++ ++ ++class TrainingSpec(BaseModel): ++ model_config = ConfigDict(extra="forbid") ++ ++ training_type: Literal["sft", "distillation"] = "sft" ++ finetuning_type: Literal["lora", "all_weights", "lora_merged"] = "lora" ++ lora: LoRAParams | None = None ++ max_seq_length: int = Field(default=2048, gt=0) ++ execution_profile: str | None = Field(default=None, min_length=1) ++ teacher_model: str | None = None ++ distillation_ratio: float = Field(default=0.5, ge=0.0, le=1.0) ++ distillation_temperature: float = Field(default=1.0, gt=0.0) ++ teacher_precision: Literal["bf16", "fp16", "fp32"] = "bf16" ++ offload_teacher: bool = False ++ ++ @model_validator(mode="after") ++ def _training_type_fields(self) -> Self: ++ if self.training_type == "distillation" and not self.teacher_model: ++ raise ValueError("teacher_model is required when training_type is distillation") ++ if self.finetuning_type.startswith("lora") and self.lora is None: ++ self.lora = LoRAParams() ++ return self ++ ++ ++class ScheduleSpec(BaseModel): ++ model_config = ConfigDict(extra="forbid") ++ ++ epochs: int = Field(default=1, gt=0) ++ max_steps: int | None = Field(default=None, gt=0) ++ 
val_check_interval: float | None = None ++ seed: int | None = None ++ ++ ++class BatchSpec(BaseModel): ++ model_config = ConfigDict(extra="forbid") ++ ++ global_batch_size: int = Field(default=8, gt=0) ++ micro_batch_size: int = Field(default=1, gt=0) ++ sequence_packing: bool = False ++ ++ ++class OptimizerSpec(BaseModel): ++ model_config = ConfigDict(extra="forbid") ++ ++ learning_rate: float = Field(default=5e-6, gt=0.0) ++ weight_decay: float = Field(default=0.01, ge=0.0) ++ warmup_steps: int = Field(default=0, ge=0) ++ ++ ++class ParallelismSpec(BaseModel): ++ model_config = ConfigDict(extra="forbid") ++ ++ num_nodes: int = Field(default=1, gt=0) ++ num_gpus_per_node: int = Field(default=1, gt=0) ++ tensor_parallel_size: int = Field(default=1, gt=0) ++ pipeline_parallel_size: int = Field(default=1, gt=0) ++ context_parallel_size: int = Field(default=1, gt=0) ++ expert_parallel_size: int | None = Field(default=None, gt=0) ++ ++ ++class OutputRequest(BaseModel): ++ model_config = ConfigDict(extra="forbid") ++ ++ name: str ++ description: str | None = None ++ ++ ++class OutputResponse(BaseModel): ++ model_config = ConfigDict(extra="forbid") ++ ++ name: str ++ type: Literal["model", "adapter"] ++ fileset: str ++ description: str | None = None ++ ++ ++class WandbIntegration(BaseModel): ++ model_config = ConfigDict(extra="forbid") ++ ++ enabled: bool = True ++ project: str | None = None ++ api_key_secret: str | None = None ++ ++ ++class IntegrationsSpec(BaseModel): ++ model_config = ConfigDict(extra="forbid") ++ ++ wandb: WandbIntegration | None = None ++ mlflow: dict[str, Any] | None = None ++ ++ ++class AutomodelJobInput(BaseModel): ++ """POST body / CLI JSON.""" ++ ++ model_config = ConfigDict(extra="forbid") ++ ++ name: str | None = None ++ model: str ++ dataset: DatasetSpec ++ training: TrainingSpec ++ schedule: ScheduleSpec = Field(default_factory=ScheduleSpec) ++ batch: BatchSpec = Field(default_factory=BatchSpec) ++ optimizer: OptimizerSpec = 
Field(default_factory=OptimizerSpec) ++ parallelism: ParallelismSpec = Field(default_factory=ParallelismSpec) ++ output: OutputRequest | None = None ++ integrations: IntegrationsSpec | None = None ++ ++ @model_validator(mode="before") ++ @classmethod ++ def reject_legacy_fields(cls, data: object) -> object: ++ if isinstance(data, dict) and "output_model" in data: ++ raise ValueError("spec.output_model was removed. Use spec.output instead.") ++ return data ++ ++ ++class AutomodelJobOutput(BaseModel): ++ """Stored canonical spec after ``to_spec()``.""" ++ ++ model_config = ConfigDict(extra="forbid") ++ ++ name: str | None = None ++ model: str ++ dataset: DatasetSpec ++ training: TrainingSpec ++ schedule: ScheduleSpec ++ batch: BatchSpec ++ optimizer: OptimizerSpec ++ parallelism: ParallelismSpec ++ output: OutputResponse ++ integrations: IntegrationsSpec | None = None ++ ++ def validate_for_training(self) -> None: ++ """MoE / parallelism constraints (ported from legacy CustomizationJobOutput).""" ++ p = self.parallelism ++ num_nodes = p.num_nodes ++ num_gpus_per_node = p.num_gpus_per_node ++ tp = p.tensor_parallel_size ++ pp = p.pipeline_parallel_size ++ cp = p.context_parallel_size ++ ep = p.expert_parallel_size ++ ++ total_gpus = num_gpus_per_node * num_nodes ++ model_parallel_size = tp * pp * cp ++ if total_gpus % model_parallel_size != 0: ++ raise ValidationError.from_exception_data( ++ "parallelism", ++ [ ++ { ++ "type": "value_error", ++ "loc": ("parallelism",), ++ "msg": ( ++ f"Total GPUs ({total_gpus}) must be divisible by " ++ f"tensor_parallel_size ({tp}) * pipeline_parallel_size ({pp}) * " ++ f"context_parallel_size ({cp}) = {model_parallel_size}" ++ ), ++ "input": p.model_dump(), ++ } ++ ], ++ ) ++ ++ derived_dp = total_gpus // model_parallel_size ++ gb = self.batch.global_batch_size ++ mb = self.batch.micro_batch_size ++ divisor = mb * derived_dp ++ if gb % divisor != 0: ++ raise ValidationError.from_exception_data( ++ "batch", ++ [ ++ { ++ "type": 
"value_error", ++ "loc": ("batch", "global_batch_size"), ++ "msg": ( ++ f"global_batch_size ({gb}) must be divisible by " ++ f"micro_batch_size ({mb}) * data_parallel_size ({derived_dp}) = {divisor}" ++ ), ++ "input": gb, ++ } ++ ], ++ ) ++ ++ if ep is not None: ++ dp_cp = derived_dp * cp ++ if dp_cp % ep != 0: ++ raise ValidationError.from_exception_data( ++ "parallelism", ++ [ ++ { ++ "type": "value_error", ++ "loc": ("parallelism", "expert_parallel_size"), ++ "msg": ( ++ f"(data_parallel_size * context_parallel_size) ({dp_cp}) " ++ f"must be divisible by expert_parallel_size ({ep})" ++ ), ++ "input": ep, ++ } ++ ], ++ ) ++ if ep > 1 and tp > 1 and total_gpus > 1: ++ raise ValidationError.from_exception_data( ++ "parallelism", ++ [ ++ { ++ "type": "value_error", ++ "loc": ("parallelism", "tensor_parallel_size"), ++ "msg": ( ++ f"Tensor parallelism (tensor_parallel_size={tp}) is not supported for MoE models " ++ f"when expert_parallel_size > 1 ({ep}); tensor_parallel_size must be 1." ++ ), ++ "input": tp, ++ } ++ ], ++ ) +diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/__init__.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/__init__.py +new file mode 100644 +index 000000000..0b9ecd889 +--- /dev/null ++++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/__init__.py +@@ -0,0 +1,18 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Automodel contributor SDK (mounted under ``client.customization`` by nemo-customizer).""" ++ ++from nemo_automodel_plugin.sdk.resources import ( ++ AsyncAutomodelCustomization, ++ AsyncAutomodelJobsResource, ++ AutomodelCustomization, ++ AutomodelJobsResource, ++) ++ ++__all__ = [ ++ "AsyncAutomodelCustomization", ++ "AsyncAutomodelJobsResource", ++ "AutomodelCustomization", ++ "AutomodelJobsResource", ++] +diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/http_utils.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/http_utils.py +new file mode 100644 +index 000000000..1efbc31fc +--- /dev/null ++++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/http_utils.py +@@ -0,0 +1,63 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Shared HTTP helpers for Automodel customization SDK resources.""" ++ ++from __future__ import annotations ++ ++from typing import Any ++from urllib.parse import quote, urljoin ++ ++from nemo_automodel_plugin.schema import AutomodelJobInput ++from nemo_platform import AsyncNeMoPlatform, NeMoPlatform ++ ++PlatformClient = NeMoPlatform | AsyncNeMoPlatform ++ ++_API_PREFIX = "/apis/customization" ++_JOBS_COLLECTION = "v2/workspaces/{workspace}/automodel/jobs" ++ ++ ++def base_url(source: str) -> str: ++ """Return the normalized base URL for a raw URL string.""" ++ return source.rstrip("/") ++ ++ ++def resolve_workspace(platform: PlatformClient, workspace: str | None, strict: bool = False) -> str: ++ """Return the explicit, platform, or default workspace for customization routes.""" ++ resolved = workspace or platform.workspace ++ if resolved is None: ++ if strict: ++ raise ValueError("workspace must be provided when the client has no default workspace") ++ return "default" ++ return resolved ++ ++ ++def url(platform: PlatformClient, path: str, workspace: str | None = 
def platform_default_headers(platform: PlatformClient) -> dict[str, str]:
    """Collect the client's default headers that have string values.

    Header names are coerced with ``str()``; entries whose value is not a
    ``str`` are left out, so the result is safe to pass to a direct HTTP call.
    """
    headers: dict[str, str] = {}
    for header_name, header_value in platform.default_headers.items():
        if not isinstance(header_value, str):
            continue
        headers[str(header_name)] = header_value
    return headers
class AutomodelJobResource:
    """Synchronous handle bound to a single submitted Automodel job.

    The handle stores the job record plus everything needed to issue follow-up
    requests: the HTTP client, the platform base URL, the workspace, and the
    headers to send.
    """

    def __init__(
        self,
        job: AutomodelJobRecord,
        http_client: Any,
        base_url: str,
        workspace: str,
        headers: dict[str, str],
    ) -> None:
        # Connection details first, then the record they refer to.
        self._http_client = http_client
        self._headers = headers
        self._base_url = base_url
        self._workspace = workspace
        self.job = job

    def get_status(self) -> PlatformJobStatusResponse:
        """Fetch the job over HTTP and parse the body as a status response.

        Calls ``raise_for_status()`` on the response before parsing, so HTTP
        error responses surface as whatever that method raises.
        """
        status_url = _job_status_path(self._base_url, self._workspace, self.job.name)
        reply = self._http_client.get(status_url, headers=self._headers)
        reply.raise_for_status()
        return PlatformJobStatusResponse.model_validate(reply.json())
def _job_status_path(base_url: str, workspace: str, job_name: str) -> str:
    """Build the absolute URL of one Automodel job under the customization API.

    The workspace and job name are percent-encoded with ``safe=""`` so that
    ``/`` and other reserved characters stay inside a single path segment.
    The base URL is normalised by ``http_utils.base_url`` before use.
    """
    workspace_segment = quote(workspace, safe="")
    job_segment = quote(job_name, safe="")
    root = http_utils.base_url(base_url)
    return f"{root}/apis/customization/v2/workspaces/{workspace_segment}/automodel/jobs/{job_segment}"
payload = response.json() ++ if not isinstance(payload, dict): ++ raise TypeError("Automodel health response must be a JSON object.") ++ return {str(key): value for key, value in payload.items()} ++ ++ def create( ++ self, ++ spec: AutomodelJobInput, ++ workspace: str | None = None, ++ name: str | None = None, ++ ) -> AutomodelJobResource: ++ """Submit an Automodel training job.""" ++ body: dict[str, Any] = http_utils.create_job_payload(spec) ++ if name is not None: ++ body["name"] = name ++ response = self._http_client.post( ++ http_utils.jobs_collection_url(self._platform, workspace), ++ json=body, ++ headers=http_utils.platform_default_headers(self._platform), ++ ) ++ response.raise_for_status() ++ record = AutomodelJobRecord.model_validate(response.json()) ++ resolved_ws = http_utils.resolve_workspace(self._platform, workspace) ++ return AutomodelJobResource( ++ job=record, ++ http_client=self._http_client, ++ base_url=http_utils.base_url(str(self._platform.base_url)), ++ workspace=resolved_ws, ++ headers=http_utils.platform_default_headers(self._platform), ++ ) ++ ++ def get_job_resource(self, job_name: str, workspace: str | None = None) -> AutomodelJobResource: ++ """Get a resource handle for an existing Automodel job.""" ++ resolved_ws = http_utils.resolve_workspace(self._platform, workspace) ++ response = self._http_client.get( ++ http_utils.job_url(self._platform, job_name, resolved_ws), ++ headers=http_utils.platform_default_headers(self._platform), ++ ) ++ response.raise_for_status() ++ return AutomodelJobResource( ++ job=AutomodelJobRecord.model_validate(response.json()), ++ http_client=self._http_client, ++ base_url=http_utils.base_url(str(self._platform.base_url)), ++ workspace=resolved_ws, ++ headers=http_utils.platform_default_headers(self._platform), ++ ) ++ ++ ++class AsyncAutomodelJobsResource: ++ """Async SDK namespace at ``client.customization.automodel.jobs``.""" ++ ++ def __init__(self, platform: AsyncNeMoPlatform) -> None: ++ self._platform 
= platform ++ self._http_client = platform._client ++ ++ async def plugin_status(self) -> dict[str, object]: ++ """Return Automodel contributor health from the customization service.""" ++ response = await self._http_client.get( ++ http_utils.url( ++ self._platform, ++ "v2/workspaces/{workspace}/automodel/healthz", ++ self._platform.workspace, ++ ), ++ headers=http_utils.platform_default_headers(self._platform), ++ ) ++ response.raise_for_status() ++ payload = response.json() ++ if not isinstance(payload, dict): ++ raise TypeError("Automodel health response must be a JSON object.") ++ return {str(key): value for key, value in payload.items()} ++ ++ async def create( ++ self, ++ spec: AutomodelJobInput, ++ workspace: str | None = None, ++ name: str | None = None, ++ ) -> AsyncAutomodelJobResource: ++ """Submit an Automodel training job.""" ++ body: dict[str, Any] = http_utils.create_job_payload(spec) ++ if name is not None: ++ body["name"] = name ++ response = await self._http_client.post( ++ http_utils.jobs_collection_url(self._platform, workspace), ++ json=body, ++ headers=http_utils.platform_default_headers(self._platform), ++ ) ++ response.raise_for_status() ++ record = AutomodelJobRecord.model_validate(response.json()) ++ resolved_ws = http_utils.resolve_workspace(self._platform, workspace) ++ return AsyncAutomodelJobResource( ++ job=record, ++ http_client=self._http_client, ++ base_url=http_utils.base_url(str(self._platform.base_url)), ++ workspace=resolved_ws, ++ headers=http_utils.platform_default_headers(self._platform), ++ ) ++ ++ async def get_job_resource(self, job_name: str, workspace: str | None = None) -> AsyncAutomodelJobResource: ++ """Get a resource handle for an existing Automodel job.""" ++ resolved_ws = http_utils.resolve_workspace(self._platform, workspace) ++ response = await self._http_client.get( ++ http_utils.job_url(self._platform, job_name, resolved_ws), ++ headers=http_utils.platform_default_headers(self._platform), ++ ) ++ 
class AutomodelCustomization:
    """Sync SDK namespace exposed as ``client.customization.automodel``.

    Its only member is ``jobs``, an ``AutomodelJobsResource`` built from the
    platform client passed in.
    """

    def __init__(self, platform: NeMoPlatform) -> None:
        jobs_namespace = AutomodelJobsResource(platform)
        self.jobs = jobs_namespace
++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Input → canonical spec transformation.""" ++ ++from __future__ import annotations ++ ++import uuid ++from typing import TYPE_CHECKING ++ ++from nemo_automodel_plugin.schema import ( ++ AutomodelJobInput, ++ AutomodelJobOutput, ++ OutputRequest, ++ OutputResponse, ++) ++from nmp.automodel.platform_client import check_dataset_access, fetch_model_entity ++from nmp.common.entities.utils import parse_entity_ref ++ ++if TYPE_CHECKING: ++ from nemo_platform import AsyncNeMoPlatform ++ ++_MAX_PREFIX_LEN = 50 ++_HEX_LEN = 12 ++ ++ ++def _random_suffix(prefix: str) -> str: ++ truncated = prefix[:_MAX_PREFIX_LEN].rstrip("-") ++ return f"{truncated}-{uuid.uuid4().hex[:_HEX_LEN]}" ++ ++ ++def _entity_basename(model_ref: str, workspace: str) -> str: ++ return parse_entity_ref(model_ref, workspace).name ++ ++ ++def _dataset_basename(uri: str) -> str: ++ normalized = uri ++ if normalized.startswith("fileset://"): ++ normalized = normalized[len("fileset://") :] ++ return parse_entity_ref(normalized, "default").name ++ ++ ++def _infer_output_type(input_spec: AutomodelJobInput, is_embedding_model: bool) -> str: ++ if is_embedding_model: ++ return "model" ++ lora = input_spec.training.lora ++ if input_spec.training.finetuning_type == "lora" and lora is not None and not lora.merge: ++ return "adapter" ++ return "model" ++ ++ ++async def transform_input_to_output( ++ input_spec: AutomodelJobInput, ++ workspace: str, ++ sdk: AsyncNeMoPlatform, ++) -> AutomodelJobOutput: ++ """Enrich submitter input into canonical AutomodelJobOutput.""" ++ model_entity = await fetch_model_entity(input_spec.model, workspace, sdk) ++ await check_dataset_access(sdk, input_spec.dataset.training, workspace) ++ if input_spec.dataset.validation: ++ await check_dataset_access(sdk, input_spec.dataset.validation, workspace) ++ ++ is_embedding = bool(model_entity.spec and getattr(model_entity.spec, "is_embedding_model", False)) ++ if is_embedding: ++ raise ValueError( 
++ "Embedding-model SFT is not supported in Automodel v1. " ++ "Use a causal LM checkpoint or wait for a future release." ++ ) ++ ++ entity_name = _entity_basename(input_spec.model, workspace) ++ dataset_name = _dataset_basename(input_spec.dataset.training) ++ output_type = _infer_output_type(input_spec, is_embedding) ++ ++ if input_spec.output is None: ++ out_name = _random_suffix(f"{entity_name}-{dataset_name}") ++ fileset = out_name ++ else: ++ out_name = input_spec.output.name ++ fileset = out_name ++ ++ output = OutputResponse( ++ name=out_name, ++ type=output_type, # type: ignore[arg-type] ++ fileset=fileset, ++ description=input_spec.output.description if input_spec.output else None, ++ ) ++ ++ return AutomodelJobOutput( ++ name=input_spec.name, ++ model=input_spec.model, ++ dataset=input_spec.dataset, ++ training=input_spec.training, ++ schedule=input_spec.schedule, ++ batch=input_spec.batch, ++ optimizer=input_spec.optimizer, ++ parallelism=input_spec.parallelism, ++ output=output, ++ integrations=input_spec.integrations, ++ ) +diff --git a/plugins/nemo-automodel/tests/fixtures/minimal_sft_lora.json b/plugins/nemo-automodel/tests/fixtures/minimal_sft_lora.json +new file mode 100644 +index 000000000..8e71f7ba5 +--- /dev/null ++++ b/plugins/nemo-automodel/tests/fixtures/minimal_sft_lora.json +@@ -0,0 +1,30 @@ ++{ ++ "model": "meta/llama-test", ++ "dataset": { ++ "training": "default/train-data" ++ }, ++ "training": { ++ "training_type": "sft", ++ "finetuning_type": "lora", ++ "max_seq_length": 2048 ++ }, ++ "schedule": { ++ "epochs": 1, ++ "max_steps": 10 ++ }, ++ "batch": { ++ "global_batch_size": 8, ++ "micro_batch_size": 1 ++ }, ++ "optimizer": { ++ "learning_rate": 5e-6 ++ }, ++ "parallelism": { ++ "num_nodes": 1, ++ "num_gpus_per_node": 1, ++ "tensor_parallel_size": 1 ++ }, ++ "output": { ++ "name": "test-out" ++ } ++} +diff --git a/plugins/nemo-automodel/tests/test_api.py b/plugins/nemo-automodel/tests/test_api.py +new file mode 100644 +index 
def _make_automodel_app() -> FastAPI:
    """Build a bare FastAPI app with only the Automodel contributor's routers mounted."""
    application = FastAPI()
    for router_spec in AutomodelContributor().get_routers():
        # Only pass a tag list when the spec actually carries a tag.
        if router_spec.tag:
            route_tags = [router_spec.tag]
        else:
            route_tags = None
        application.include_router(router_spec.router, prefix=router_spec.prefix, tags=route_tags)
    return application
def test_submit_path_includes_workspace() -> None:
    """The submit path for ``AutomodelJob`` embeds the given workspace segment."""
    expected = "/apis/customization/v2/workspaces/acme-corp/automodel/jobs"
    assert submit_path_for(AutomodelJob, workspace="acme-corp") == expected
"nemo_platform_plugin.discovery.discover_jobs", ++ lambda: {"customization.automodel.jobs": AutomodelJob}, ++ ) ++ scheduler = NemoJobScheduler() ++ scheduler.submit_remote( ++ AutomodelJob, ++ json.loads(load_job_json(FIXTURES / "minimal_sft_lora.json")), ++ base_url="https://nmp.test", ++ workspace="ws-a", ++ http_client=httpx.Client(transport=httpx.MockTransport(handler)), ++ ) ++ ++ assert capture["method"] == "POST" ++ assert capture["url"] == "https://nmp.test/apis/customization/v2/workspaces/ws-a/automodel/jobs" ++ assert capture["body"]["spec"]["training"]["training_type"] == "sft" ++ ++ ++def test_cli_submit_accepts_job_json_file(monkeypatch: pytest.MonkeyPatch) -> None: ++ """Contributor CLI: ``submit JOB.json -w ws`` forwards workspace to submit_remote.""" ++ submitted: dict = {} ++ ++ def fake_submit_remote( ++ _scheduler, ++ job_cls: type, ++ spec_data: dict, ++ base_url: str | None, ++ workspace: str, ++ profile: str | None = None, ++ options: dict | None = None, ++ metadata: dict | None = None, ++ http_client: httpx.Client | None = None, ++ ) -> dict: ++ del _scheduler, job_cls, profile, options, metadata, http_client ++ submitted["workspace"] = workspace ++ submitted["spec"] = spec_data ++ submitted["base_url"] = base_url ++ return {"id": "job-99"} ++ ++ monkeypatch.setattr( ++ "nemo_platform_plugin.commands.NemoJobScheduler.submit_remote", ++ fake_submit_remote, ++ ) ++ monkeypatch.setattr( ++ "nemo_platform_plugin.discovery.discover_jobs", ++ lambda: {"customization.automodel.jobs": AutomodelJob}, ++ ) ++ ++ automodel_cli = AutomodelContributor().get_cli() ++ runner = CliRunner() ++ result = runner.invoke( ++ automodel_cli, ++ [ ++ "submit", ++ str(FIXTURES / "minimal_sft_lora.json"), ++ "--workspace", ++ "acme-corp", ++ "--base-url", ++ "https://nmp.test", ++ ], ++ ) ++ assert result.exit_code == 0, result.stdout + result.stderr ++ assert submitted["workspace"] == "acme-corp" ++ assert submitted["base_url"] == "https://nmp.test" ++ assert 
def test_cli_run_is_disabled() -> None:
    """``run`` exits with code 1 and explains that local runs are unsupported."""
    fixture_path = str(FIXTURES / "minimal_sft_lora.json")
    outcome = CliRunner().invoke(AutomodelContributor().get_cli(), ["run", fixture_path])
    assert outcome.exit_code == 1
    assert "does not support local run" in outcome.stderr
def test_contributor_get_cli_exposes_flat_verbs() -> None:
    """The contributor CLI is a Typer app named ``automodel`` with top-level verbs and no ``jobs`` group."""
    import typer

    cli = AutomodelContributor().get_cli()
    assert isinstance(cli, typer.Typer)
    assert cli.info.name == "automodel"

    group_names = {group.name for group in cli.registered_groups}
    assert "jobs" not in group_names

    command_names = {command.name for command in cli.registered_commands}
    assert {"run", "submit", "explain"}.issubset(command_names)
def test_reject_output_model() -> None:
    """Validation fails with a message mentioning ``output_model`` when that field is supplied."""
    payload = {
        "model": "llama",
        "dataset": {"training": "default/train"},
        "training": {"training_type": "sft"},
        "output_model": "old-field",
    }
    with pytest.raises(ValueError, match="output_model"):
        AutomodelJobInput.model_validate(payload)
`automodel`) ++- `get_routers()` — `RouterSpec` list with a **unique** prefix under `v2/workspaces/{workspace}/<name>/` ++- `get_cli()` — optional `typer.Typer` mounted at `nemo customization <name>` ++- SDK: contributors implement HTTP/CLI only; **`nemo-customizer-plugin`** owns `nemo.sdk` → `customization` and composes backends (e.g. `client.customization.automodel.jobs` from `nemo-automodel-plugin`) ++ ++## pyproject.toml ++ ++```toml ++[project.entry-points."nemo.customization.contributors"] ++automodel = "nemo_automodel_plugin.contributor:AutomodelContributor" ++``` ++ ++## Jobs ++ ++Use `add_job_routes(YourJob, service_name="customization", ...)` so Jobs records use `source=customization`. +diff --git a/plugins/nemo-customizer/pyproject.toml b/plugins/nemo-customizer/pyproject.toml +new file mode 100644 +index 000000000..29003c8fa +--- /dev/null ++++ b/plugins/nemo-customizer/pyproject.toml +@@ -0,0 +1,43 @@ ++[project] ++name = "nemo-customizer-plugin" ++version = "0.1.0" ++description = "Customization router for NeMo Platform training backends."
# Development-only tooling; not part of the published package dependencies.
[dependency-groups]
dev = [
    "pytest>=8.3.4",
    "pytest-asyncio>=0.25.3",
    "ruff>=0.11.8",
    "fastapi>=0.115.0",
]
class CustomizationCLI(NemoCLI):
    """Root of the ``nemo customization`` command tree.

    Each discovered customization contributor that provides a Typer app is
    mounted as a subgroup named after its entry-point key.
    """

    name: ClassVar[str] = "customization"
    description: ClassVar[str] = "Customization training backends (Automodel, …)."

    def get_cli(self) -> typer.Typer:
        """Build the Typer app, mounting contributor subgroups in sorted key order.

        When no contributors are discovered, a hint is written to stderr and
        the app is returned without subgroups.
        """
        root = typer.Typer(
            name=self.name,
            help=self.description,
            no_args_is_help=True,
        )

        discovered = discover_customization_contributors()
        if discovered:
            for entry_key in sorted(discovered.keys()):
                contributor_cli = discovered[entry_key].get_cli()
                # Contributors may opt out of the CLI by returning None.
                if contributor_cli is None:
                    continue
                root.add_typer(contributor_cli, name=entry_key)
        else:
            typer.echo(
                "No customization contributors installed. "
                "Add nemo-automodel (or another backend) to enabled-plugins.",
                err=True,
            )

        return root
class CustomizationRouterError(RuntimeError):
    """Raised when the customization router cannot start."""


# Dependencies the router declares on its own; contributor dependencies are
# unioned on top of these by ``merge_router_dependencies``.
_ROUTER_BASE_DEPENDENCIES = ("entities", "auth", "jobs", "secrets", "files", "models")


def merge_router_dependencies(contributors: dict[str, object]) -> list[str]:
    """Union platform router deps with each contributor's ``dependencies``.

    Args:
        contributors: Mapping of contributor key to contributor instance.

    Returns:
        Sorted, de-duplicated dependency names. A contributor whose class has no
        ``dependencies`` attribute (or a falsy one) contributes nothing.
    """
    deps = set(_ROUTER_BASE_DEPENDENCIES)
    for contributor in contributors.values():
        # Looked up on the class, not the instance, so per-instance overrides
        # are ignored.
        deps.update(getattr(type(contributor), "dependencies", None) or [])
    return sorted(deps)


def _assert_no_prefix_collisions(contributors: dict[str, object]) -> None:
    """Reject two different contributors that mount routers on the same prefix.

    Prefixes are compared with leading/trailing slashes stripped. A single
    contributor may expose several routers under one prefix; that is not a
    collision and is no longer reported as one (it used to raise with the same
    contributor named twice in the message).

    Raises:
        CustomizationRouterError: when two distinct contributors share a prefix.
    """
    owners: dict[str, str] = {}
    for key, contributor in contributors.items():
        for spec in contributor.get_routers():  # type: ignore[union-attr]
            prefix = spec.prefix.strip("/")
            # First contributor to claim a prefix owns it.
            owner = owners.setdefault(prefix, key)
            if owner != key:
                raise CustomizationRouterError(
                    f"Route prefix collision: contributors {owner!r} and {key!r} "
                    f"both use prefix {spec.prefix!r}",
                )


class CustomizationRouterService(NemoService):
    """Sole ``nemo.services`` owner for ``/apis/customization``.

    Discovers customization contributors at construction time and exposes
    their routers, plus a ``/healthz`` route, through ``get_routers``.
    """

    name: ClassVar[str] = "customization"
    dependencies: ClassVar[list[str]] = list(_ROUTER_BASE_DEPENDENCIES)

    def __init__(self) -> None:
        """Discover contributors and validate them.

        Raises:
            CustomizationRouterError: if no contributor is discovered, or two
                contributors collide on a route prefix.
        """
        self._contributors = discover_customization_contributors()
        if not self._contributors:
            raise CustomizationRouterError(
                "Customization router is enabled but no contributors were discovered. "
                "Install a backend plugin (e.g. nemo-automodel) and ensure "
                f"'{CUSTOMIZATION_CONTRIBUTORS_GROUP}' entry points are registered.",
            )
        _assert_no_prefix_collisions(self._contributors)
        # Assigned on the class, so every instance (and any reader of the
        # class attribute) sees the merged list.
        # NOTE(review): presumably the platform reads ``dependencies`` from the
        # class - confirm before changing this to an instance attribute.
        type(self).dependencies = merge_router_dependencies(self._contributors)

    def get_routers(self) -> list[RouterSpec]:
        """Return the health router followed by every contributor router.

        Contributors are visited in sorted key order so the result is
        deterministic. A contributor spec without a tag gets
        ``"Customization <key>"``.
        """
        router = APIRouter()

        @router.get("/healthz")
        async def healthz() -> dict[str, object]:
            return {
                "plugin": self.name,
                "status": "ok",
                "contributors": sorted(self._contributors.keys()),
            }

        specs: list[RouterSpec] = [
            RouterSpec(
                router=router,
                tag="Customization",
                description="Customization router health.",
                prefix="",
            ),
        ]

        for key in sorted(self._contributors.keys()):
            contributor = self._contributors[key]
            specs.extend(
                RouterSpec(
                    router=spec.router,
                    tag=spec.tag or f"Customization {key}",
                    description=spec.description,
                    prefix=spec.prefix,
                )
                for spec in contributor.get_routers()
            )
        return specs
logger = logging.getLogger(__name__)

# Contributor entry-point key → (module, sync class, async class)
_CONTRIBUTOR_SDK: dict[str, tuple[str, str, str]] = {
    "automodel": (
        "nemo_automodel_plugin.sdk.resources",
        "AutomodelCustomization",
        "AsyncAutomodelCustomization",
    ),
}


def _load_contributor_sdk_class(module_path: str, class_name: str) -> type[Any]:
    """Import ``module_path`` and return its attribute ``class_name``.

    Mirrors ``from module import name``: a module that imports fine but lacks
    the class raises ``ImportError`` rather than ``AttributeError``, so callers
    need only one ``except ImportError`` for both failure modes.

    Raises:
        ImportError: if the module cannot be imported or the class is absent.
    """
    module = importlib.import_module(module_path)
    try:
        return getattr(module, class_name)
    except AttributeError as exc:
        raise ImportError(
            f"cannot import name {class_name!r} from {module_path!r}",
            name=module_path,
        ) from exc


def _mount_contributor_sdks(namespace: object, platform: Any, *, use_async: bool) -> None:
    """Attach one SDK object per discovered contributor to ``namespace``.

    Each key in ``_CONTRIBUTOR_SDK`` that is also a discovered contributor gets
    its sync or async SDK class instantiated with ``platform`` and stored as
    ``namespace.<key>``. An SDK that fails to import is logged and skipped, so
    one broken backend does not break construction of the whole namespace.
    """
    contributors = discover_customization_contributors()
    for key, (module_path, sync_cls, async_cls) in _CONTRIBUTOR_SDK.items():
        if key not in contributors:
            continue
        class_name = async_cls if use_async else sync_cls
        try:
            sdk_cls = _load_contributor_sdk_class(module_path, class_name)
            setattr(namespace, key, sdk_cls(platform))
        except ImportError:
            # exc_info keeps the underlying cause (e.g. a missing transitive
            # dependency or renamed class) visible in the logs.
            logger.warning(
                "Customization contributor %r is installed but SDK module %s is missing",
                key,
                module_path,
                exc_info=True,
            )


class Customization:
    """Sync SDK namespace mounted as ``client.customization``."""

    def __init__(self, platform: NeMoPlatform) -> None:
        _mount_contributor_sdks(self, platform, use_async=False)


class AsyncCustomization:
    """Async SDK namespace mounted as ``client.customization``."""

    def __init__(self, platform: AsyncNeMoPlatform) -> None:
        _mount_contributor_sdks(self, platform, use_async=True)


customization_sdk_resources = NemoPluginSDKResources(
    sync_resource=Customization,
    async_resource=AsyncCustomization,
)
++# SPDX-License-Identifier: Apache-2.0 ++ ++from __future__ import annotations ++ ++from nemo_customizer.discovery import ( ++ CUSTOMIZATION_CONTRIBUTORS_GROUP, ++ discover_customization_contributor_classes, ++ discover_customization_contributors, ++) ++from nemo_platform_plugin.discovery import ( ++ discover_customization_contributors as platform_discover, ++) ++ ++ ++def test_reexport_matches_platform_discovery() -> None: ++ assert discover_customization_contributors is platform_discover ++ assert CUSTOMIZATION_CONTRIBUTORS_GROUP == "nemo.customization.contributors" ++ discover_customization_contributors.cache_clear() ++ assert isinstance(discover_customization_contributors(), dict) ++ assert isinstance(discover_customization_contributor_classes(), dict) +diff --git a/plugins/nemo-customizer/tests/test_router.py b/plugins/nemo-customizer/tests/test_router.py +new file mode 100644 +index 000000000..1409fadab +--- /dev/null ++++ b/plugins/nemo-customizer/tests/test_router.py +@@ -0,0 +1,103 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
@pytest.fixture(autouse=True)
def _restore_router_dependencies(monkeypatch: pytest.MonkeyPatch) -> None:
    """Undo the class-level ``dependencies`` rewrite after every test.

    ``CustomizationRouterService.__init__`` assigns the merged dependency list
    to the class, which would otherwise leak from one test into the next.
    """
    monkeypatch.setattr(
        CustomizationRouterService,
        "dependencies",
        list(CustomizationRouterService.dependencies),
    )


class _FakeContributor:
    """Minimal contributor: one ``/ping`` route, one CLI command, one extra dependency."""

    name: ClassVar[str] = "fake"
    dependencies: ClassVar[list[str]] = ["studio"]

    def get_routers(self) -> list[RouterSpec]:
        router = APIRouter()

        @router.get("/ping")
        async def ping() -> dict[str, str]:
            return {"backend": "fake"}

        return [
            RouterSpec(
                router=router,
                prefix="/v2/workspaces/{workspace}/fake",
                tag="Fake",
            ),
        ]

    def get_cli(self) -> typer.Typer:
        app = typer.Typer()

        @app.command("info")
        def info() -> None:
            typer.echo("fake")

        return app


def test_merge_router_dependencies_unions_contributor_deps() -> None:
    deps = merge_router_dependencies({"fake": _FakeContributor()})
    assert "studio" in deps
    assert "jobs" in deps


def test_router_sets_merged_dependencies(monkeypatch: pytest.MonkeyPatch) -> None:
    monkeypatch.setattr(
        "nemo_customizer.router.discover_customization_contributors",
        lambda: {"fake": _FakeContributor()},
    )
    # Construction alone publishes the merged list on the class.
    CustomizationRouterService()
    assert "studio" in CustomizationRouterService.dependencies


def test_router_raises_without_contributors(monkeypatch: pytest.MonkeyPatch) -> None:
    monkeypatch.setattr(
        "nemo_customizer.router.discover_customization_contributors",
        lambda: {},
    )
    with pytest.raises(CustomizationRouterError, match="no contributors"):
        CustomizationRouterService()


def test_router_merges_contributor_routes(monkeypatch: pytest.MonkeyPatch) -> None:
    monkeypatch.setattr(
        "nemo_customizer.router.discover_customization_contributors",
        lambda: {"fake": _FakeContributor()},
    )
    service = CustomizationRouterService()
    app = FastAPI()
    for spec in service.get_routers():
        if spec.prefix:
            app.include_router(spec.router, prefix=spec.prefix)
        else:
            app.include_router(spec.router)

    client = TestClient(app)
    assert client.get("/healthz").json()["contributors"] == ["fake"]
    assert client.get("/v2/workspaces/ws-a/fake/ping").json() == {"backend": "fake"}


def test_prefix_collision_raises(monkeypatch: pytest.MonkeyPatch) -> None:
    # Two distinct contributors inheriting the same prefix must be rejected.
    class _DupA(_FakeContributor):
        name = "a"

    class _DupB(_FakeContributor):
        name = "b"

    monkeypatch.setattr(
        "nemo_customizer.router.discover_customization_contributors",
        lambda: {"a": _DupA(), "b": _DupB()},
    )
    with pytest.raises(CustomizationRouterError, match="collision"):
        CustomizationRouterService()
++# SPDX-License-Identifier: Apache-2.0 ++ ++from __future__ import annotations ++ ++from unittest.mock import MagicMock, patch ++ ++from nemo_customizer.sdk.resources import ( ++ AsyncCustomization, ++ Customization, ++ customization_sdk_resources, ++) ++from nemo_platform_plugin.sdk import NemoPluginSDKResources ++ ++ ++def test_customization_sdk_resources_entry_point_shape() -> None: ++ assert isinstance(customization_sdk_resources, NemoPluginSDKResources) ++ assert customization_sdk_resources.sync_resource is Customization ++ assert customization_sdk_resources.async_resource is AsyncCustomization ++ ++ ++def test_customization_composes_automodel_when_contributor_present() -> None: ++ platform = MagicMock() ++ platform._client = MagicMock() ++ platform.workspace = "default" ++ platform.base_url = "http://localhost:8000" ++ platform.default_headers = {} ++ ++ fake_contributor = object() ++ with patch( ++ "nemo_customizer.sdk.resources.discover_customization_contributors", ++ return_value={"automodel": fake_contributor}, ++ ): ++ customization = Customization(platform) ++ ++ assert hasattr(customization, "automodel") ++ assert hasattr(customization.automodel, "jobs") +diff --git a/pyproject.toml b/pyproject.toml +index d51dd4928..61934cb58 100644 +--- a/pyproject.toml ++++ b/pyproject.toml +@@ -180,6 +180,8 @@ enabled-plugins = [ + "nemo-auditor-plugin", + "nemo-switchyard", + "nemo-agents-plugin", ++ "nemo-customizer-plugin", ++ "nemo-automodel-plugin", + ] + + # Legacy runtime needed specifically for task images that still invoke +@@ -443,6 +445,9 @@ nemo-auditor-plugin = { workspace = true } + nemo-switchyard = { workspace = true } + nemo-agents-plugin = { workspace = true } + builtin-nemo-agent = { workspace = true } ++nemo-customizer-plugin = { workspace = true } ++nemo-automodel-plugin = { workspace = true } ++nmp-automodel = { workspace = true } + + + [tool.uv.workspace] +@@ -493,6 +498,9 @@ members = [ + "plugins/nemo-switchyard", + "plugins/nemo-agents", 
+ "plugins/nemo-agents/builtin_agents/nemo-agent", ++ "plugins/nemo-customizer", ++ "plugins/nemo-automodel", ++ "services/automodel", + ] + + +diff --git a/services/automodel/README.md b/services/automodel/README.md +new file mode 100644 +index 000000000..ae10b1370 +--- /dev/null ++++ b/services/automodel/README.md +@@ -0,0 +1,3 @@ ++# nmp-automodel ++ ++Compiler and task entrypoints for NeMo Automodel training jobs on the platform. **No HTTP server** — consumed by `nemo-automodel-plugin` and Jobs task images (`nvcr.io/0921617854601259/nemo-platform-dev/nmp/automodel-tasks`, `.../nmp/automodel-training`). +diff --git a/services/automodel/docker/Dockerfile.mamba-wheel b/services/automodel/docker/Dockerfile.mamba-wheel +new file mode 100644 +index 000000000..357428358 +--- /dev/null ++++ b/services/automodel/docker/Dockerfile.mamba-wheel +@@ -0,0 +1,245 @@ ++# syntax=docker/dockerfile:1 ++####### ++# Mamba Wheel Builder ++# ++# Builds Python wheels for: ++# - causal-conv1d (CUDA extension required by mamba-ssm) ++# - mamba-ssm (selective state space model) ++# ++# Both only ship source distributions on PyPI and require nvcc to compile. ++# The two builds are independent stages so BuildKit runs them in parallel. ++# Each image stores its wheel at /wheels/*.whl. 
++# Build via Platform bake group: docker buildx bake -f docker-bake.automodel.hcl nmp-automodel-gpu-wheels ++# ++# Build args: ++# CAUSAL_CONV1D_VERSION - git tag to build (default: v1.5.3) ++# MAMBA_22_COMMIT - git SHA or tag to build ++# CUDA_VERSION - CUDA devel image version (default: 12.8.1) ++# TORCH_CUDA_ARCH_LIST - semicolon-separated SM targets (default: "7.5;8.0;8.6;9.0") ++####### ++ ++ARG CUDA_VERSION=12.8.1 ++ ++# ============================================================================= ++# Shared base: CUDA + Python 3.11 + torch (required by extension builds) ++# ============================================================================= ++FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS mamba-wheel-base ++ ++ARG TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0" ++ ++ENV DEBIAN_FRONTEND=noninteractive ++ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} ++ ++RUN apt-get update && apt-get install -y --no-install-recommends \ ++ build-essential \ ++ ca-certificates \ ++ git \ ++ python3.11 \ ++ python3.11-dev \ ++ python3.11-venv \ ++ && rm -rf /var/lib/apt/lists/* ++ ++COPY --from=ghcr.io/astral-sh/uv:0.9.14 /uv /usr/local/bin/uv ++COPY services/automodel/docker/locks/mamba-wheel-build-py311 /opt/mamba-wheel-build-py311 ++ ++# Create the Python 3.11 build venv from a committed lockfile. 
++RUN uv venv --python 3.11 /opt/venv && \ ++ UV_PROJECT_ENVIRONMENT=/opt/venv uv sync \ ++ --project /opt/mamba-wheel-build-py311 \ ++ --locked \ ++ --no-install-project && \ ++ echo "=== torch version (py311 wheel build) ===" && \ ++ /opt/venv/bin/python -c "import torch; print(f'torch={torch.__version__}, CUDA={torch.version.cuda}')" ++ ++ENV VIRTUAL_ENV=/opt/venv ++ENV PATH="/opt/venv/bin:$PATH" ++ ++# ============================================================================= ++# Python 3.12 base: extends mamba-wheel-base with Python 3.12 for cp312 wheels ++# ============================================================================= ++FROM mamba-wheel-base AS mamba-wheel-base-py312 ++ ++# Install Python 3.12 via deadsnakes PPA (Ubuntu 22.04 ships 3.10 by default) ++RUN apt-get update && apt-get install -y --no-install-recommends \ ++ software-properties-common \ ++ && add-apt-repository ppa:deadsnakes/ppa \ ++ && apt-get update && apt-get install -y --no-install-recommends \ ++ python3.12 \ ++ python3.12-dev \ ++ python3.12-venv \ ++ && apt-get clean \ ++ && rm -rf /var/lib/apt/lists/* ++ ++COPY services/automodel/docker/locks/mamba-wheel-build-py312 /opt/mamba-wheel-build-py312 ++ ++# Create the Python 3.12 build venv from a committed lockfile. 
++RUN uv venv --python 3.12 /opt/venv312 && \ ++ UV_PROJECT_ENVIRONMENT=/opt/venv312 uv sync \ ++ --project /opt/mamba-wheel-build-py312 \ ++ --locked \ ++ --no-install-project && \ ++ echo "=== torch version (py312 wheel build) ===" && \ ++ /opt/venv312/bin/python -c "import torch; print(f'torch={torch.__version__}, CUDA={torch.version.cuda}')" ++ ++ENV VIRTUAL_ENV=/opt/venv312 ++ENV PATH="/opt/venv312/bin:$PATH" ++ ++# special builder for 13.1.1 cuda ++FROM nvcr.io/nvidia/pytorch:26.02-py3 AS mamba-wheel-base-py312-cu13.1.1 ++ ++ENV DEBIAN_FRONTEND=noninteractive ++ ++RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common \ ++ && apt-get clean \ ++ && rm -rf /var/lib/apt/lists/* ++ ++COPY --from=ghcr.io/astral-sh/uv:0.9.14 /uv /usr/local/bin/uv ++ ++ ++# ============================================================================= ++# causal-conv1d wheel — Python 3.11 (for nmp-gpu-tasks and nmp-customizer-tasks) ++# ============================================================================= ++FROM mamba-wheel-base AS causal-conv1d-wheel-builder ++ ++ARG CAUSAL_CONV1D_VERSION=v1.5.3 ++ ++RUN mkdir -p /wheels && \ ++ git clone --depth 1 --branch ${CAUSAL_CONV1D_VERSION} \ ++ https://github.com/Dao-AILab/causal-conv1d.git /src/causal-conv1d && \ ++ cd /src/causal-conv1d && \ ++ CAUSAL_CONV1D_FORCE_BUILD=TRUE uv build --wheel --no-build-isolation --out-dir=/wheels . 
&& \ ++ rm -rf /src/causal-conv1d ++ ++# ============================================================================= ++# causal-conv1d wheel — Python 3.12 (for Python 3.12 consumers) ++# ============================================================================= ++FROM mamba-wheel-base-py312 AS causal-conv1d-wheel-builder-py312 ++ ++ARG CAUSAL_CONV1D_VERSION=v1.5.3 ++ ++RUN mkdir -p /wheels && \ ++ git clone --depth 1 --branch ${CAUSAL_CONV1D_VERSION} \ ++ https://github.com/Dao-AILab/causal-conv1d.git /src/causal-conv1d && \ ++ cd /src/causal-conv1d && \ ++ CAUSAL_CONV1D_FORCE_BUILD=TRUE uv build --wheel --no-build-isolation --out-dir=/wheels . && \ ++ rm -rf /src/causal-conv1d ++ ++# ============================================================================= ++# causal-conv1d wheel — Python 3.12 (for Python 3.12 consumers) - Using CUDA 13.1.1 ++# ============================================================================= ++FROM mamba-wheel-base-py312-cu13.1.1 AS causal-conv1d-wheel-builder-py312-cu13.1.1 ++ ++ARG CAUSAL_CONV1D_VERSION=v1.5.3 ++ ++RUN mkdir -p /wheels && \ ++ git clone --depth 1 --branch ${CAUSAL_CONV1D_VERSION} \ ++ https://github.com/Dao-AILab/causal-conv1d.git /src/causal-conv1d && \ ++ cd /src/causal-conv1d && \ ++ CAUSAL_CONV1D_FORCE_BUILD=TRUE uv build --wheel --no-build-isolation --out-dir=/wheels . && \ ++ rm -rf /src/causal-conv1d ++ ++# The final causal-conv1d-wheel image contains: ++# - causal_conv1d-*-cp311-*.whl (for Python 3.11 consumers: nmp-gpu-tasks, nmp-customizer-tasks) ++# - causal_conv1d-*-cp312-*.whl (for Python 3.12 consumers) ++# Consumers must pin the Python tag glob (e.g. causal_conv1d-*cp311*.whl) to select the right one. 
++FROM scratch AS causal-conv1d-wheel ++COPY --from=causal-conv1d-wheel-builder /wheels /wheels ++COPY --from=causal-conv1d-wheel-builder-py312 /wheels /wheels ++COPY --from=causal-conv1d-wheel-builder-py312-cu13.1.1 /wheels /wheels/cu13.1.1 ++ ++# ============================================================================= ++# mamba-ssm 2.2.5 wheel — Python 3.11 (for nmp-gpu-tasks) ++# ============================================================================= ++FROM mamba-wheel-base AS mamba-ssm-wheel-builder ++ ++# post commit after 2.2.5 ++ARG MAMBA_22_COMMIT=6b32be06d026e170b3fdaf3ae6282c5a6ff57b06 ++ ++RUN mkdir -p /wheels && \ ++ git clone https://github.com/state-spaces/mamba.git /src/mamba && \ ++ cd /src/mamba && \ ++ git checkout ${MAMBA_22_COMMIT} && \ ++ sed -i "/triton/d" setup.py && \ ++ sed -i "/triton/d" pyproject.toml && \ ++ uv build --wheel --no-build-isolation --out-dir=/wheels . && \ ++ rm -rf /src/mamba ++ ++# ============================================================================= ++# mamba-ssm 2.2.5 wheel — Python 3.12 (for Python 3.12 consumers, e.g. automodel) ++# ============================================================================= ++FROM mamba-wheel-base-py312 AS mamba-ssm-25-wheel-builder-py312 ++ ++# post commit after 2.2.5 ++ARG MAMBA_22_COMMIT=6b32be06d026e170b3fdaf3ae6282c5a6ff57b06 ++ ++RUN mkdir -p /wheels && \ ++ git clone https://github.com/state-spaces/mamba.git /src/mamba && \ ++ cd /src/mamba && \ ++ git checkout ${MAMBA_22_COMMIT} && \ ++ sed -i "/triton/d" setup.py && \ ++ sed -i "/triton/d" pyproject.toml && \ ++ uv build --wheel --no-build-isolation --out-dir=/wheels . 
&& \ ++ rm -rf /src/mamba ++ ++# ============================================================================= ++# mamba-ssm 2.3.0 wheel — Python 3.11 (for nmp-customizer-tasks) ++# ============================================================================= ++FROM mamba-wheel-base AS mamba-ssm-23-wheel-builder ++ ++ARG MAMBA_23_COMMIT=v2.3.0 ++ ++RUN mkdir -p /wheels && \ ++ git clone https://github.com/state-spaces/mamba.git /src/mamba && \ ++ cd /src/mamba && \ ++ git checkout ${MAMBA_23_COMMIT} && \ ++ sed -i "/triton/d" setup.py && \ ++ sed -i "/triton/d" pyproject.toml && \ ++ uv build --wheel --no-build-isolation --out-dir=/wheels . && \ ++ rm -rf /src/mamba ++ ++# ============================================================================= ++# mamba-ssm 2.3.0 wheel — Python 3.12 (for Python 3.12 consumers) ++# ============================================================================= ++FROM mamba-wheel-base-py312 AS mamba-ssm-23-wheel-builder-py312 ++ ++ARG MAMBA_23_COMMIT=v2.3.0 ++ ++RUN mkdir -p /wheels && \ ++ git clone https://github.com/state-spaces/mamba.git /src/mamba && \ ++ cd /src/mamba && \ ++ git checkout ${MAMBA_23_COMMIT} && \ ++ sed -i "/triton/d" setup.py && \ ++ sed -i "/triton/d" pyproject.toml && \ ++ uv build --wheel --no-build-isolation --out-dir=/wheels . 
&& \ ++ rm -rf /src/mamba ++ ++ ++# ============================================================================= ++# mamba-ssm 2.3.0 wheel — Python 3.12 (for Python 3.12 consumers) - Using CUDA 13.1.1 ++# ============================================================================= ++FROM mamba-wheel-base-py312-cu13.1.1 AS mamba-ssm-23-wheel-builder-py312-cu13.1.1 ++ ++ARG MAMBA_23_COMMIT=v2.3.0 ++ ++RUN mkdir -p /wheels && \ ++ git clone https://github.com/state-spaces/mamba.git /src/mamba && \ ++ cd /src/mamba && \ ++ git checkout ${MAMBA_23_COMMIT} && \ ++ sed -i "/triton/d" setup.py && \ ++ sed -i "/triton/d" pyproject.toml && \ ++ uv build --wheel --no-build-isolation --out-dir=/wheels . && \ ++ rm -rf /src/mamba ++ ++ ++# The final mamba-ssm-wheel image contains four versions under /wheels (listed below), plus a mamba-ssm 2.3.0 cp312 wheel built against CUDA 13.1.1 under /wheels/cu13.1.1: ++# - mamba_ssm-2.2.5-cp311-*.whl (from MAMBA_22_COMMIT=6b32be06, for nmp-gpu-tasks / Python 3.11) ++# - mamba_ssm-2.2.5-cp312-*.whl (from MAMBA_22_COMMIT=6b32be06, for Python 3.12 consumers, e.g. automodel) ++# - mamba_ssm-2.3.0-cp311-*.whl (from v2.3.0, for nmp-customizer-tasks / Python 3.11) ++# - mamba_ssm-2.3.0-cp312-*.whl (from v2.3.0, for Python 3.12 consumers) ++# Consumers must pin both version AND Python tag glob to select the correct wheel. 
++FROM scratch AS mamba-ssm-wheel ++COPY --from=mamba-ssm-wheel-builder /wheels /wheels ++COPY --from=mamba-ssm-25-wheel-builder-py312 /wheels /wheels ++COPY --from=mamba-ssm-23-wheel-builder /wheels /wheels ++COPY --from=mamba-ssm-23-wheel-builder-py312 /wheels /wheels ++COPY --from=mamba-ssm-23-wheel-builder-py312-cu13.1.1 /wheels /wheels/cu13.1.1 +diff --git a/services/automodel/docker/Dockerfile.nmp-automodel-base b/services/automodel/docker/Dockerfile.nmp-automodel-base +new file mode 100644 +index 000000000..a2f6e8ced +--- /dev/null ++++ b/services/automodel/docker/Dockerfile.nmp-automodel-base +@@ -0,0 +1,91 @@ ++# syntax=docker/dockerfile:1 ++# nmp-automodel base - PyTorch NGC image + Automodel + CUDA extension wheels. 
++# ++# Mirrors nmp/docker/Dockerfile.nmp-customizer customizer-automodel-base-builder. ++# Publish target: nmp-automodel-base-builder (tags as nmp-automodel-base). ++ ++ARG CAUSAL_CONV1D_WHEEL_IMAGE=local ++ARG MAMBA_SSM_WHEEL_IMAGE=local ++ARG AUTOMODEL_COMMIT=0e9909f56ba48ef9761fc6f49323ba9d0a0835b2 ++ ++FROM ${CAUSAL_CONV1D_WHEEL_IMAGE} AS causal-conv1d-wheel-src ++FROM ${MAMBA_SSM_WHEEL_IMAGE} AS mamba-ssm-wheel-src ++ ++FROM alpine/git AS automodel-clone ++ARG AUTOMODEL_COMMIT ++RUN git clone --branch main https://github.com/NVIDIA-NeMo/Automodel.git /opt/Automodel && \ ++ cd /opt/Automodel && \ ++ git checkout ${AUTOMODEL_COMMIT} && \ ++ rm -rf /opt/Automodel/.git ++ ++FROM nvcr.io/nvidia/pytorch:26.02-py3 AS nmp-automodel-base-builder ++ ++WORKDIR /opt ++ ++COPY --from=ghcr.io/astral-sh/uv:0.9.14 /uv /bin/uv ++RUN uv self update || true ++ ++ENV VIRTUAL_ENV=/opt/venv \ ++ UV_PROJECT_ENVIRONMENT=/opt/venv \ ++ UV_LINK_MODE=copy \ ++ UV_COMPILE_BYTECODE=1 ++ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH" ++ ++RUN uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages ++ ++COPY --from=automodel-clone /opt/Automodel /opt/Automodel ++COPY services/customizer/src/cherry-picks /opt/cherry-picks ++RUN cd /opt/Automodel && patch -p1 < /opt/cherry-picks/e6d2930a.diff ++ ++RUN cd /opt/Automodel && \ ++ bash docker/common/update_pyproject_pytorch.sh /opt/Automodel ++ ++RUN --mount=type=cache,target=/root/.cache/uv \ ++ cd /opt/Automodel && \ ++ UV_HTTP_TIMEOUT=120 uv sync --locked --extra all --all-groups ++ ++# Install AFTER Automodel sync - uv sync drops packages not in its lockfile. 
++RUN --mount=from=causal-conv1d-wheel-src,target=/tmp/causal-conv1d-wheel-src,readonly \ ++ --mount=from=mamba-ssm-wheel-src,target=/tmp/mamba-ssm-wheel-src,readonly \ ++ uv pip install --no-cache-dir --no-deps \ ++ /tmp/causal-conv1d-wheel-src/wheels/cu13.1.1/causal_conv1d-*cp312*.whl \ ++ /tmp/mamba-ssm-wheel-src/wheels/cu13.1.1/mamba_ssm-2.3.0-cp312*.whl ++ ++RUN --mount=type=cache,target=/root/.cache/uv \ ++ uv pip install --no-build-isolation --no-deps git+https://github.com/fanshiqing/grouped_gemm@v1.1.4 ++ ++RUN --mount=type=cache,target=/root/.cache/uv \ ++ git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && \ ++ cd bitsandbytes && \ ++ git checkout 0.49.1 && \ ++ cmake -DCOMPUTE_CAPABILITY="75;80;86;87;89;90;100;103;110;120;121" -DCOMPUTE_BACKEND=cuda -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -S . && \ ++ make -j"$(nproc)" && \ ++ uv pip install scikit-build-core --no-deps && \ ++ uv pip install --no-build-isolation --no-deps --force-reinstall . && \ ++ uv pip uninstall scikit-build-core ++ ++RUN if [ -f /usr/local/bin/torchrun ]; then \ ++ sed -i '1c\#!/opt/venv/bin/python' /usr/local/bin/torchrun; \ ++ fi ++ ++RUN --mount=type=cache,target=/root/.cache/uv \ ++ uv pip install "hf-transfer>=0.1.8,<0.2" ++ ++RUN --mount=type=cache,target=/root/.cache/uv \ ++ rm -rf /opt/venv/lib/python*/site-packages/vllm \ ++ /opt/venv/lib/python*/site-packages/vllm-*.dist-info && \ ++ uv pip install \ ++ "black>=26.3.1" \ ++ "pyasn1>=0.6.3" \ ++ "onnx>=1.21.0" ++ ++# Published base image (same filesystem as builder). 
++FROM nvcr.io/nvidia/pytorch:26.02-py3 AS nmp-automodel-base ++COPY --from=nmp-automodel-base-builder /opt/venv /opt/venv ++COPY --from=nmp-automodel-base-builder /opt/Automodel /opt/Automodel ++ ++ENV VIRTUAL_ENV=/opt/venv \ ++ UV_PROJECT_ENVIRONMENT=/opt/venv \ ++ HF_HUB_ENABLE_HF_TRANSFER=1 ++ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH" ++WORKDIR /opt +diff --git a/services/automodel/docker/Dockerfile.nmp-automodel-tasks b/services/automodel/docker/Dockerfile.nmp-automodel-tasks +new file mode 100644 +index 000000000..e8ccda82b +--- /dev/null ++++ b/services/automodel/docker/Dockerfile.nmp-automodel-tasks +@@ -0,0 +1,49 @@ ++# syntax=docker/dockerfile:1 ++# nmp-automodel tasks - file_io, model_entity, and other platform task steps. ++# Built on nmp-automodel-base (GPU-capable; runs on CPU or GPU nodes). ++ ++ARG BASE_TAG_AUTOMODEL=local ++ARG BASE_REGISTRY=nvcr.io/0921617854601259/nemo-platform-dev ++ARG SMOKE_MARKER=smoke_nmp_automodel_tasks ++ ++FROM ${BASE_REGISTRY}/nmp/automodel-base:${BASE_TAG_AUTOMODEL} AS nmp-automodel-base ++ ++FROM nmp-automodel-base AS runtime ++ ++ARG USERNAME=ubuntu ++ARG USER_UID=1000 ++ARG USER_GID=1000 ++ ++ENV HF_HUB_ENABLE_HF_TRANSFER=1 \ ++ OTEL_PYTHON_EXCLUDED_URLS="health" ++ ++COPY --from=platform-workspace / /app ++WORKDIR /app ++ ++RUN mkdir -p /home/${USERNAME}/.cache && \ ++ chown -R ${USER_UID}:${USER_GID} /home/${USERNAME} /app/services/automodel ++ ++RUN --mount=type=cache,target=/root/.cache/uv \ ++ uv pip install --python ${VIRTUAL_ENV}/bin/python --no-cache \ ++ -e /app/sdk/python/nemo-platform \ ++ -e /app/packages/nemo_platform_plugin \ ++ -e /app/packages/nmp_common \ ++ -e /app/packages/nemo_platform \ ++ -e /app/services/automodel ++ ++RUN uv pip install --python ${VIRTUAL_ENV}/bin/python --upgrade setuptools && \ ++ uv pip install --python ${VIRTUAL_ENV}/bin/python --no-cache \ ++ "python-multipart>=0.0.22" ++ ++ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" ++ENTRYPOINT ["nemo-platform"] ++CMD ["--help"] ++ 
++USER ${USER_UID}:${USER_GID} ++ ++FROM runtime AS smoke-test ++ARG SMOKE_MARKER ++USER 0 ++COPY tests/smoke_gpu/ /smoke_test/ ++RUN uv pip install --python ${VIRTUAL_ENV}/bin/python --no-cache --reinstall pytest && \ ++ ${VIRTUAL_ENV}/bin/pytest /smoke_test/ -m ${SMOKE_MARKER} -v +diff --git a/services/automodel/docker/Dockerfile.nmp-automodel-training b/services/automodel/docker/Dockerfile.nmp-automodel-training +new file mode 100644 +index 000000000..61b10a6d3 +--- /dev/null ++++ b/services/automodel/docker/Dockerfile.nmp-automodel-training +@@ -0,0 +1,54 @@ ++# syntax=docker/dockerfile:1 ++# nmp-automodel training - GPU finetune step (nemo_automodel recipes + nmp-automodel package). ++# Same platform glue as tasks; separate image tag for the compiler training step. ++ ++ARG BASE_TAG_AUTOMODEL=local ++ARG BASE_REGISTRY=nvcr.io/0921617854601259/nemo-platform-dev ++ARG SMOKE_MARKER=smoke_nmp_automodel_training ++ ++FROM ${BASE_REGISTRY}/nmp/automodel-base:${BASE_TAG_AUTOMODEL} AS nmp-automodel-base ++ ++FROM nmp-automodel-base AS runtime ++ ++ARG USERNAME=ubuntu ++ARG USER_UID=1000 ++ARG USER_GID=1000 ++ ++ENV HF_HUB_ENABLE_HF_TRANSFER=1 \ ++ OTEL_PYTHON_EXCLUDED_URLS="health" ++ ++COPY --from=platform-workspace / /app ++WORKDIR /app ++ ++RUN mkdir -p /home/${USERNAME}/.cache && \ ++ chown -R ${USER_UID}:${USER_GID} /home/${USERNAME} /app/services/automodel ++ ++RUN --mount=type=cache,target=/root/.cache/uv \ ++ uv pip install --python ${VIRTUAL_ENV}/bin/python --no-cache \ ++ -e /app/sdk/python/nemo-platform \ ++ -e /app/packages/nemo_platform_plugin \ ++ -e /app/packages/nmp_common \ ++ -e /app/packages/nemo_platform \ ++ -e /app/services/automodel ++ ++# Pin training backend requirement (also declared in Automodel lockfile). 
++RUN --mount=type=cache,target=/root/.cache/uv \ ++ uv pip install --python ${VIRTUAL_ENV}/bin/python --no-cache \ ++ -r /app/services/automodel/src/nmp/automodel/tasks/training/backends/requirements.txt ++ ++RUN uv pip install --python ${VIRTUAL_ENV}/bin/python --upgrade setuptools && \ ++ uv pip install --python ${VIRTUAL_ENV}/bin/python --no-cache \ ++ "python-multipart>=0.0.22" ++ ++ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" ++ENTRYPOINT ["nemo-platform"] ++CMD ["--help"] ++ ++USER ${USER_UID}:${USER_GID} ++ ++FROM runtime AS smoke-test ++ARG SMOKE_MARKER ++USER 0 ++COPY tests/smoke_gpu/ /smoke_test/ ++RUN uv pip install --python ${VIRTUAL_ENV}/bin/python --no-cache --reinstall pytest && \ ++ ${VIRTUAL_ENV}/bin/pytest /smoke_test/ -m ${SMOKE_MARKER} -v +diff --git a/services/automodel/docker/Dockerfile.platform-workspace b/services/automodel/docker/Dockerfile.platform-workspace +new file mode 100644 +index 000000000..76356df6c +--- /dev/null ++++ b/services/automodel/docker/Dockerfile.platform-workspace +@@ -0,0 +1,12 @@ ++# syntax=docker/dockerfile:1 ++# Minimal Platform workspace slice for nmp-automodel container installs. ++# Used as a named build context (platform-workspace). 
++ ++FROM scratch AS platform-workspace ++COPY pyproject.toml uv.lock README.md ./ ++COPY packages/nmp_common packages/nmp_common ++COPY packages/nemo_platform packages/nemo_platform ++COPY packages/nemo_platform_plugin packages/nemo_platform_plugin ++COPY packages/nemo_platform_ext packages/nemo_platform_ext ++COPY sdk/python/nemo-platform sdk/python/nemo-platform ++COPY services/automodel services/automodel +diff --git a/services/automodel/docker/README.md b/services/automodel/docker/README.md +new file mode 100644 +index 000000000..29020cf77 +--- /dev/null ++++ b/services/automodel/docker/README.md +@@ -0,0 +1,86 @@ ++# nmp-automodel container images ++ ++Three images derived from the legacy `nmp` **customizer-automodel** base builder (not the full `customizer-automodel` HTTP service image). Published under **`nvcr.io/0921617854601259/nemo-platform-dev/nmp-automodel-*`** to avoid colliding with the upstream NGC `nvcr.io/nvidia/nemo-automodel` training container. ++ ++| Image | Dockerfile | Role | ++|-------|------------|------| ++| `nmp-automodel-base` | `Dockerfile.nmp-automodel-base` | PyTorch 26.02 + Automodel + `mamba-ssm` / `causal-conv1d` wheels | ++| `nmp-automodel-tasks` | `Dockerfile.nmp-automodel-tasks` | Platform task glue (`file_io`, `model_entity`); GPU-capable base | ++| `nmp-automodel-training` | `Dockerfile.nmp-automodel-training` | Training step (`nmp.automodel.tasks.training`) | ++ ++Full references (default tag `local`): ++ ++- `nvcr.io/0921617854601259/nemo-platform-dev/nmp-automodel-base:local` ++- `nvcr.io/0921617854601259/nemo-platform-dev/nmp-automodel-tasks:local` ++- `nvcr.io/0921617854601259/nemo-platform-dev/nmp-automodel-training:local` ++ ++Bake file: **`docker-bake.automodel.hcl`** at the Platform repo root (`context = "."`). Run all commands from the Platform repo root. ++ ++## `docker buildx bake --print` ++ ++`--print` only parses the HCL and prints JSON. A **0.0s FINISHED** result is normal — no image is built.
Use it to verify targets, tags, and platforms before a real build. ++ ++## Prerequisites ++ ++1. **CUDA extension wheels** (`causal-conv1d-wheel`, `mamba-ssm-wheel`) - built from this directory or pulled from NGC. The wheel Dockerfile and uv locks live under `docker/locks/` (ported from `nmp`). ++ ++2. **Base image tag** - after building the base, set `BASE_TAG_AUTOMODEL` (or push to `BASE_REGISTRY`) before building tasks/training. ++ ++## Build wheels and push to NGC (from Platform root) ++ ++```bash ++cd /path/to/Platform ++ ++docker login nvcr.io ++ ++export WHEELS_TAG="$(git rev-parse --short HEAD)" ++# Bake variables (WHEELS_REGISTRY, WHEELS_TAG, IMAGE_REGISTRY) are overridden via env, not --set. ++# Example: ++# export WHEELS_REGISTRY=nvcr.io/0921617854601259/nemo-platform-dev ++# export IMAGE_REGISTRY=nvcr.io/0921617854601259/nemo-platform-dev ++ ++docker buildx bake --print -f docker-bake.automodel.hcl nmp-automodel-gpu-wheels ++ ++docker buildx bake \ ++ -f docker-bake.automodel.hcl \ ++ nmp-automodel-gpu-wheels \ ++ --push \ ++ --set "*.platform=linux/amd64" ++``` ++ ++Override platform: `export BUILD_PLATFORM=linux/amd64` or `--set "*.platform=linux/amd64"`. ++ ++## Build automodel images (from Platform root) ++ ++```bash ++cd /path/to/Platform ++ ++export WHEELS_TAG="${WHEELS_TAG:-3fd6986ff173b598446ffac06d9be3f84b482495}" ++export BAKE_TAG="${WHEELS_TAG}" ++ ++docker buildx bake \ ++ -f docker-bake.automodel.hcl \ ++ nmp-automodel-base-builder \ ++ --push \ ++ --set "*.platform=linux/amd64" ++ ++docker buildx bake \ ++ -f docker-bake.automodel.hcl \ ++ nmp-automodel \ ++ --push \ ++ --set "*.platform=linux/amd64" ++``` ++ ++To use wheels already published without rebuilding, `export WHEELS_TAG=<published-tag>` and set `BAKE_TAG` to the same value. ++ ++Override registry: `export WHEELS_REGISTRY=...` and `export IMAGE_REGISTRY=...` before bake. ++ ++## Runtime ++ ++Entrypoint is `nemo-platform`.
Orchestration should pass `run task` with the task module, for example: ++ ++```bash ++nemo-platform run task --task nmp.automodel.tasks.file_io ++``` ++ ++The job compiler resolves `nmp/automodel-tasks` and `nmp/automodel-training` under `NMP_AUTOMODEL_IMAGE_REGISTRY` (default `nvcr.io/0921617854601259/nemo-platform-dev`). See `nmp.automodel.images`. +diff --git a/services/automodel/docker/docker-bake.hcl b/services/automodel/docker/docker-bake.hcl +new file mode 100644 +index 000000000..47cb2b0c4 +--- /dev/null ++++ b/services/automodel/docker/docker-bake.hcl +@@ -0,0 +1,4 @@ ++# Moved to Platform repo root (same pattern as nmp/docker-bake.hcl): ++# docker buildx bake -f docker-bake.automodel.hcl ++# ++# Context is "." (repo root when run from Platform/). Do not use ../../.. here. +diff --git a/services/automodel/docker/locks/README.md b/services/automodel/docker/locks/README.md +new file mode 100644 +index 000000000..1dbcfdc3c +--- /dev/null ++++ b/services/automodel/docker/locks/README.md +@@ -0,0 +1,11 @@ ++# Mamba / causal-conv1d wheel build locks ++ ++Copied from `nmp/docker/locks/` for building `causal-conv1d-wheel` and `mamba-ssm-wheel` images from the Platform repo (see `Dockerfile.mamba-wheel` and `docker-bake.automodel.hcl` group `nmp-automodel-gpu-wheels`). 
++ ++To refresh locks after dependency changes: ++ ++```bash ++cd /path/to/Platform ++uv lock --project services/automodel/docker/locks/mamba-wheel-build-py311 --python 3.11 ++uv lock --project services/automodel/docker/locks/mamba-wheel-build-py312 --python 3.12 ++``` +diff --git a/services/automodel/docker/locks/mamba-wheel-build-py311/pyproject.toml b/services/automodel/docker/locks/mamba-wheel-build-py311/pyproject.toml +new file mode 100644 +index 000000000..275dc68f3 +--- /dev/null ++++ b/services/automodel/docker/locks/mamba-wheel-build-py311/pyproject.toml +@@ -0,0 +1,27 @@ ++[project] ++name = "mamba-wheel-build-py311" ++version = "0.0.0"  # placeholder: the lock records this project as virtual, so it is never built ++requires-python = ">=3.11,<3.12"  # CPython 3.11 only; the py312 sibling project locks 3.12 ++dependencies = [ ++ "packaging", ++ "setuptools", ++ "wheel", ++ "torch==2.10.0+cu128; sys_platform == 'linux'",  # +cu128 local build, resolved from the pytorch-cu128 index declared below ++] ++ ++[tool.uv] ++required-version = ">=0.9.14,<0.10.0" ++prerelease = "if-necessary-or-explicit" ++index-strategy = "unsafe-best-match" ++environments = [  # the lock is resolved only for Linux x86_64 and aarch64 ++ "sys_platform == 'linux' and platform_machine == 'x86_64'", ++ "sys_platform == 'linux' and platform_machine == 'aarch64'", ++] ++ ++[tool.uv.sources] ++torch = { index = "pytorch-cu128", marker = "sys_platform == 'linux'" } ++ ++[[tool.uv.index]] ++name = "pytorch-cu128" ++url = "https://download.pytorch.org/whl/cu128" ++explicit = true  # only packages mapped to this index in [tool.uv.sources] resolve from it +diff --git a/services/automodel/docker/locks/mamba-wheel-build-py311/uv.lock b/services/automodel/docker/locks/mamba-wheel-build-py311/uv.lock +new file mode 100644 +index 000000000..686846739 +--- /dev/null ++++ b/services/automodel/docker/locks/mamba-wheel-build-py311/uv.lock +@@ -0,0 +1,355 @@ ++version = 1 ++revision = 3 ++requires-python = "==3.11.*" ++resolution-markers = [ ++ "platform_machine == 'x86_64' and sys_platform == 'linux'", ++ "platform_machine == 'aarch64' and sys_platform == 'linux'", ++] ++supported-markers = [ ++ "platform_machine == 'x86_64' and sys_platform == 'linux'", ++ "platform_machine == 'aarch64' and sys_platform == 'linux'", ++] ++ ++[[package]]
++name = "cuda-bindings" ++version = "12.9.4" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "cuda-pathfinder", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/a9/2b/ebcbb60aa6dba830474cd360c42e10282f7a343c0a1f58d24fbd3b7c2d77/cuda_bindings-12.9.4-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a6a429dc6c13148ff1e27c44f40a3dd23203823e637b87fd0854205195988306", size = 11840604, upload-time = "2025-10-21T14:51:34.565Z" }, ++ { url = "https://files.pythonhosted.org/packages/45/e7/b47792cc2d01c7e1d37c32402182524774dadd2d26339bd224e0e913832e/cuda_bindings-12.9.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c912a3d9e6b6651853eed8eed96d6800d69c08e94052c292fec3f282c5a817c9", size = 12210593, upload-time = "2025-10-21T14:51:36.574Z" }, ++] ++ ++[[package]] ++name = "cuda-pathfinder" ++version = "1.5.2" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/f2/f9/1b9b60a30fc463c14cdea7a77228131a0ccc89572e8df9cb86c9648271ab/cuda_pathfinder-1.5.2-py3-none-any.whl", hash = "sha256:0c5f160a7756c5b072723cbbd6d861e38917ef956c68150b02f0b6e9271c71fa", size = 49988, upload-time = "2026-04-06T23:01:05.17Z" }, ++] ++ ++[[package]] ++name = "filelock" ++version = "3.25.2" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = "https://files.pythonhosted.org/packages/94/b8/00651a0f559862f3bb7d6f7477b192afe3f583cc5e26403b44e59a55ab34/filelock-3.25.2.tar.gz", hash = "sha256:b64ece2b38f4ca29dd3e810287aa8c48182bbecd1ae6e9ae126c9b35f1382694", size = 40480, upload-time = "2026-03-11T20:45:38.487Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/a4/a5/842ae8f0c08b61d6484b52f99a03510a3a72d23141942d216ebe81fefbce/filelock-3.25.2-py3-none-any.whl", 
hash = "sha256:ca8afb0da15f229774c9ad1b455ed96e85a81373065fb10446672f64444ddf70", size = 26759, upload-time = "2026-03-11T20:45:37.437Z" }, ++] ++ ++[[package]] ++name = "fsspec" ++version = "2026.2.0" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = "https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441, upload-time = "2026-02-05T21:50:53.743Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" }, ++] ++ ++[[package]] ++name = "jinja2" ++version = "3.1.6" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "markupsafe", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, ++] ++ ++[[package]] ++name = "mamba-wheel-build-py311" ++version = "0.0.0" ++source = { virtual = "." 
} ++dependencies = [ ++ { name = "packaging", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "setuptools", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "torch", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "wheel", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++ ++[package.metadata] ++requires-dist = [ ++ { name = "packaging" }, ++ { name = "setuptools" }, ++ { name = "torch", marker = "sys_platform == 'linux'", specifier = "==2.10.0+cu128", index = "https://download.pytorch.org/whl/cu128" }, ++ { name = "wheel" }, ++] ++ ++[[package]] ++name = "markupsafe" ++version = "3.0.3" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/1d/09/adf2df3699d87d1d8184038df46a9c80d78c0148492323f4693df54e17bb/markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50", size = 24287, upload-time = "2025-09-27T18:36:20.768Z" }, ++ { url = "https://files.pythonhosted.org/packages/30/ac/0273f6fcb5f42e314c6d8cd99effae6a5354604d461b8d392b5ec9530a54/markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf", size = 22940, upload-time = "2025-09-27T18:36:22.249Z" }, ++ { url = "https://files.pythonhosted.org/packages/19/ae/31c1be199ef767124c042c6c3e904da327a2f7f0cd63a0337e1eca2967a8/markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f", size = 21887, upload-time = "2025-09-27T18:36:23.535Z" }, ++ { url = "https://files.pythonhosted.org/packages/b2/76/7edcab99d5349a4532a459e1fe64f0b0467a3365056ae550d3bcf3f79e1e/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a", size = 23692, upload-time = "2025-09-27T18:36:24.823Z" }, ++ { url = "https://files.pythonhosted.org/packages/a4/28/6e74cdd26d7514849143d69f0bf2399f929c37dc2b31e6829fd2045b2765/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115", size = 21471, upload-time = "2025-09-27T18:36:25.95Z" }, ++ { url = "https://files.pythonhosted.org/packages/62/7e/a145f36a5c2945673e590850a6f8014318d5577ed7e5920a4b3448e0865d/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a", size = 22923, upload-time = "2025-09-27T18:36:27.109Z" }, ++] ++ ++[[package]] ++name = "mpmath" ++version = "1.3.0" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = 
"sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, ++] ++ ++[[package]] ++name = "networkx" ++version = "3.6.1" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, ++] ++ ++[[package]] ++name = "nvidia-cublas-cu12" ++version = "12.8.4.1" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/29/99/db44d685f0e257ff0e213ade1964fc459b4a690a73293220e98feb3307cf/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:b86f6dd8935884615a0683b663891d43781b819ac4f2ba2b0c9604676af346d0", size = 590537124, upload-time = "2025-03-07T01:43:53.556Z" }, ++ { url = "https://files.pythonhosted.org/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142", size = 594346921, upload-time = "2025-03-07T01:44:31.254Z" }, ++] ++ ++[[package]] ++name = "nvidia-cuda-cupti-cu12" ++version = "12.8.90" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/d5/1f/b3bd73445e5cb342727fd24fe1f7b748f690b460acadc27ea22f904502c8/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash 
= "sha256:4412396548808ddfed3f17a467b104ba7751e6b58678a4b840675c56d21cf7ed", size = 9533318, upload-time = "2025-03-07T01:40:10.421Z" }, ++ { url = "https://files.pythonhosted.org/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182", size = 10248621, upload-time = "2025-03-07T01:40:21.213Z" }, ++] ++ ++[[package]] ++name = "nvidia-cuda-nvrtc-cu12" ++version = "12.8.93" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994", size = 88040029, upload-time = "2025-03-07T01:42:13.562Z" }, ++ { url = "https://files.pythonhosted.org/packages/eb/d1/e50d0acaab360482034b84b6e27ee83c6738f7d32182b987f9c7a4e32962/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc1fec1e1637854b4c0a65fb9a8346b51dd9ee69e61ebaccc82058441f15bce8", size = 43106076, upload-time = "2025-03-07T01:41:59.817Z" }, ++] ++ ++[[package]] ++name = "nvidia-cuda-runtime-cu12" ++version = "12.8.90" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/7c/75/f865a3b236e4647605ea34cc450900854ba123834a5f1598e160b9530c3a/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:52bf7bbee900262ffefe5e9d5a2a69a30d97e2bc5bb6cc866688caa976966e3d", size = 965265, upload-time = "2025-03-07T01:39:43.533Z" }, ++ { url = 
"https://files.pythonhosted.org/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90", size = 954765, upload-time = "2025-03-07T01:40:01.615Z" }, ++] ++ ++[[package]] ++name = "nvidia-cudnn-cu12" ++version = "9.10.2.21" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "nvidia-cublas-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/fa/41/e79269ce215c857c935fd86bcfe91a451a584dfc27f1e068f568b9ad1ab7/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8", size = 705026878, upload-time = "2025-06-06T21:52:51.348Z" }, ++ { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" }, ++] ++ ++[[package]] ++name = "nvidia-cufft-cu12" ++version = "11.3.3.83" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/60/bc/7771846d3a0272026c416fbb7e5f4c1f146d6d80704534d0b187dd6f4800/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a", size = 193109211, upload-time = 
"2025-03-07T01:44:56.873Z" }, ++ { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" }, ++] ++ ++[[package]] ++name = "nvidia-cufile-cu12" ++version = "1.13.1.3" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc", size = 1197834, upload-time = "2025-03-07T01:45:50.723Z" }, ++ { url = "https://files.pythonhosted.org/packages/1e/f5/5607710447a6fe9fd9b3283956fceeee8a06cda1d2f56ce31371f595db2a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:4beb6d4cce47c1a0f1013d72e02b0994730359e17801d395bdcbf20cfb3bb00a", size = 1120705, upload-time = "2025-03-07T01:45:41.434Z" }, ++] ++ ++[[package]] ++name = "nvidia-curand-cu12" ++version = "10.3.9.90" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/45/5e/92aa15eca622a388b80fbf8375d4760738df6285b1e92c43d37390a33a9a/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dfab99248034673b779bc6decafdc3404a8a6f502462201f2f31f11354204acd", size = 63625754, upload-time = "2025-03-07T01:46:10.735Z" }, ++ { url = "https://files.pythonhosted.org/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9", size = 63619976, upload-time = "2025-03-07T01:46:23.323Z" }, ++] ++ 
++[[package]] ++name = "nvidia-cusolver-cu12" ++version = "11.7.3.90" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "nvidia-cublas-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cusparse-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/c8/32/f7cd6ce8a7690544d084ea21c26e910a97e077c9b7f07bf5de623ee19981/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0", size = 267229841, upload-time = "2025-03-07T01:46:54.356Z" }, ++ { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" }, ++] ++ ++[[package]] ++name = "nvidia-cusparse-cu12" ++version = "12.5.8.93" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/bc/f7/cd777c4109681367721b00a106f491e0d0d15cfa1fd59672ce580ce42a97/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc", size = 288117129, upload-time = 
"2025-03-07T01:47:40.407Z" }, ++ { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" }, ++] ++ ++[[package]] ++name = "nvidia-cusparselt-cu12" ++version = "0.7.1" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/73/b9/598f6ff36faaece4b3c50d26f50e38661499ff34346f00e057760b35cc9d/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5", size = 283835557, upload-time = "2025-02-26T00:16:54.265Z" }, ++ { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" }, ++] ++ ++[[package]] ++name = "nvidia-nccl-cu12" ++version = "2.27.5" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/bb/1c/857979db0ef194ca5e21478a0612bcdbbe59458d7694361882279947b349/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:31432ad4d1fb1004eb0c56203dc9bc2178a1ba69d1d9e02d64a6938ab5e40e7a", size = 322400625, upload-time = "2025-06-26T04:11:04.496Z" }, ++ { url = "https://files.pythonhosted.org/packages/6e/89/f7a07dc961b60645dbbf42e80f2bc85ade7feb9a491b11a1e973aa00071f/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457", size = 322348229, upload-time = 
"2025-06-26T04:11:28.385Z" }, ++] ++ ++[[package]] ++name = "nvidia-nvjitlink-cu12" ++version = "12.8.93" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88", size = 39254836, upload-time = "2025-03-07T01:49:55.661Z" }, ++ { url = "https://files.pythonhosted.org/packages/2a/a2/8cee5da30d13430e87bf99bb33455d2724d0a4a9cb5d7926d80ccb96d008/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:adccd7161ace7261e01bb91e44e88da350895c270d23f744f0820c818b7229e7", size = 38386204, upload-time = "2025-03-07T01:49:43.612Z" }, ++] ++ ++[[package]] ++name = "nvidia-nvshmem-cu12" ++version = "3.4.5" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/1d/6a/03aa43cc9bd3ad91553a88b5f6fb25ed6a3752ae86ce2180221962bc2aa5/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b48363fc6964dede448029434c6abed6c5e37f823cb43c3bcde7ecfc0457e15", size = 138936938, upload-time = "2025-09-06T00:32:05.589Z" }, ++ { url = "https://files.pythonhosted.org/packages/b5/09/6ea3ea725f82e1e76684f0708bbedd871fc96da89945adeba65c3835a64c/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:042f2500f24c021db8a06c5eec2539027d57460e1c1a762055a6554f72c369bd", size = 139103095, upload-time = "2025-09-06T00:32:31.266Z" }, ++] ++ ++[[package]] ++name = "nvidia-nvtx-cu12" ++version = "12.8.90" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = 
"https://files.pythonhosted.org/packages/10/c0/1b303feea90d296f6176f32a2a70b5ef230f9bdeb3a72bddb0dc922dc137/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d7ad891da111ebafbf7e015d34879f7112832fc239ff0d7d776b6cb685274615", size = 91161, upload-time = "2025-03-07T01:42:23.922Z" }, ++ { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" }, ++] ++ ++[[package]] ++name = "packaging" ++version = "26.0" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, ++] ++ ++[[package]] ++name = "setuptools" ++version = "82.0.1" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = "https://files.pythonhosted.org/packages/4f/db/cfac1baf10650ab4d1c111714410d2fbb77ac5a616db26775db562c8fab2/setuptools-82.0.1.tar.gz", hash = "sha256:7d872682c5d01cfde07da7bccc7b65469d3dca203318515ada1de5eda35efbf9", size = 1152316, upload-time = "2026-03-09T12:47:17.221Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/9d/76/f789f7a86709c6b087c5a2f52f911838cad707cc613162401badc665acfe/setuptools-82.0.1-py3-none-any.whl", hash = 
"sha256:a59e362652f08dcd477c78bb6e7bd9d80a7995bc73ce773050228a348ce2e5bb", size = 1006223, upload-time = "2026-03-09T12:47:15.026Z" }, ++] ++ ++[[package]] ++name = "sympy" ++version = "1.14.0" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "mpmath", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, ++] ++ ++[[package]] ++name = "torch" ++version = "2.10.0+cu128" ++source = { registry = "https://download.pytorch.org/whl/cu128" } ++dependencies = [ ++ { name = "cuda-bindings", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "filelock", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "fsspec", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "jinja2", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "networkx", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = 
"nvidia-cublas-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cuda-cupti-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cuda-nvrtc-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cuda-runtime-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cudnn-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cufft-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cufile-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-curand-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cusolver-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cusparse-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cusparselt-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-nccl-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or 
(platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-nvshmem-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-nvtx-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "triton", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "typing-extensions", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++wheels = [ ++ { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:85ed7944655ea6fd69377692e9cbfd7bba28d99696ceae79985e7caa99cf0a95" }, ++ { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1d01ffaebf64715c0f507a39463149cb19e596ff702bd4bcf862601f2881dabc" }, ++] ++ ++[[package]] ++name = "triton" ++version = "3.6.0" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/0f/2c/96f92f3c60387e14cc45aed49487f3486f89ea27106c1b1376913c62abe4/triton-3.6.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49df5ef37379c0c2b5c0012286f80174fcf0e073e5ade1ca9a86c36814553651", size = 176081190, upload-time = "2026-01-20T16:16:00.523Z" }, ++ { url = 
"https://files.pythonhosted.org/packages/e0/12/b05ba554d2c623bffa59922b94b0775673de251f468a9609bc9e45de95e9/triton-3.6.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8e323d608e3a9bfcc2d9efcc90ceefb764a82b99dea12a86d643c72539ad5d3", size = 188214640, upload-time = "2026-01-20T16:00:35.869Z" }, ++] ++ ++[[package]] ++name = "typing-extensions" ++version = "4.15.0" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, ++] ++ ++[[package]] ++name = "wheel" ++version = "0.46.3" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "packaging", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++sdist = { url = "https://files.pythonhosted.org/packages/89/24/a2eb353a6edac9a0303977c4cb048134959dd2a51b48a269dfc9dde00c8a/wheel-0.46.3.tar.gz", hash = "sha256:e3e79874b07d776c40bd6033f8ddf76a7dad46a7b8aa1b2787a83083519a1803", size = 60605, upload-time = "2026-01-22T12:39:49.136Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/87/22/b76d483683216dde3d67cba61fb2444be8d5be289bf628c13fc0fd90e5f9/wheel-0.46.3-py3-none-any.whl", hash = "sha256:4b399d56c9d9338230118d705d9737a2a468ccca63d5e813e2a4fc7815d8bc4d", size = 30557, upload-time = "2026-01-22T12:39:48.099Z" }, ++] +diff --git 
a/services/automodel/docker/locks/mamba-wheel-build-py312/pyproject.toml b/services/automodel/docker/locks/mamba-wheel-build-py312/pyproject.toml +new file mode 100644 +index 000000000..49aecc29f +--- /dev/null ++++ b/services/automodel/docker/locks/mamba-wheel-build-py312/pyproject.toml +@@ -0,0 +1,27 @@ ++[project] ++name = "mamba-wheel-build-py312" ++version = "0.0.0" ++requires-python = ">=3.12,<3.13" ++dependencies = [ ++ "packaging", ++ "setuptools", ++ "wheel", ++ "torch==2.10.0+cu128; sys_platform == 'linux'", ++] ++ ++[tool.uv] ++required-version = ">=0.9.14,<0.10.0" ++prerelease = "if-necessary-or-explicit" ++index-strategy = "unsafe-best-match" ++environments = [ ++ "sys_platform == 'linux' and platform_machine == 'x86_64'", ++ "sys_platform == 'linux' and platform_machine == 'aarch64'", ++] ++ ++[tool.uv.sources] ++torch = { index = "pytorch-cu128", marker = "sys_platform == 'linux'" } ++ ++[[tool.uv.index]] ++name = "pytorch-cu128" ++url = "https://download.pytorch.org/whl/cu128" ++explicit = true +diff --git a/services/automodel/docker/locks/mamba-wheel-build-py312/uv.lock b/services/automodel/docker/locks/mamba-wheel-build-py312/uv.lock +new file mode 100644 +index 000000000..6d8bec6dd +--- /dev/null ++++ b/services/automodel/docker/locks/mamba-wheel-build-py312/uv.lock +@@ -0,0 +1,356 @@ ++version = 1 ++revision = 3 ++requires-python = "==3.12.*" ++resolution-markers = [ ++ "platform_machine == 'x86_64' and sys_platform == 'linux'", ++ "platform_machine == 'aarch64' and sys_platform == 'linux'", ++] ++supported-markers = [ ++ "platform_machine == 'x86_64' and sys_platform == 'linux'", ++ "platform_machine == 'aarch64' and sys_platform == 'linux'", ++] ++ ++[[package]] ++name = "cuda-bindings" ++version = "12.9.4" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "cuda-pathfinder", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, 
++] ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/0c/c2/65bfd79292b8ff18be4dd7f7442cea37bcbc1a228c1886f1dea515c45b67/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:694ba35023846625ef471257e6b5a4bc8af690f961d197d77d34b1d1db393f56", size = 11760260, upload-time = "2025-10-21T14:51:40.79Z" }, ++ { url = "https://files.pythonhosted.org/packages/a9/c1/dabe88f52c3e3760d861401bb994df08f672ec893b8f7592dc91626adcf3/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fda147a344e8eaeca0c6ff113d2851ffca8f7dfc0a6c932374ee5c47caa649c8", size = 12151019, upload-time = "2025-10-21T14:51:43.167Z" }, ++] ++ ++[[package]] ++name = "cuda-pathfinder" ++version = "1.5.2" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/f2/f9/1b9b60a30fc463c14cdea7a77228131a0ccc89572e8df9cb86c9648271ab/cuda_pathfinder-1.5.2-py3-none-any.whl", hash = "sha256:0c5f160a7756c5b072723cbbd6d861e38917ef956c68150b02f0b6e9271c71fa", size = 49988, upload-time = "2026-04-06T23:01:05.17Z" }, ++] ++ ++[[package]] ++name = "filelock" ++version = "3.25.2" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = "https://files.pythonhosted.org/packages/94/b8/00651a0f559862f3bb7d6f7477b192afe3f583cc5e26403b44e59a55ab34/filelock-3.25.2.tar.gz", hash = "sha256:b64ece2b38f4ca29dd3e810287aa8c48182bbecd1ae6e9ae126c9b35f1382694", size = 40480, upload-time = "2026-03-11T20:45:38.487Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/a4/a5/842ae8f0c08b61d6484b52f99a03510a3a72d23141942d216ebe81fefbce/filelock-3.25.2-py3-none-any.whl", hash = "sha256:ca8afb0da15f229774c9ad1b455ed96e85a81373065fb10446672f64444ddf70", size = 26759, upload-time = "2026-03-11T20:45:37.437Z" }, ++] ++ ++[[package]] ++name = "fsspec" ++version = "2026.2.0" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = 
"https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441, upload-time = "2026-02-05T21:50:53.743Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" }, ++] ++ ++[[package]] ++name = "jinja2" ++version = "3.1.6" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "markupsafe", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, ++] ++ ++[[package]] ++name = "mamba-wheel-build-py312" ++version = "0.0.0" ++source = { virtual = "." 
} ++dependencies = [ ++ { name = "packaging", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "setuptools", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "torch", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "wheel", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++ ++[package.metadata] ++requires-dist = [ ++ { name = "packaging" }, ++ { name = "setuptools" }, ++ { name = "torch", marker = "sys_platform == 'linux'", specifier = "==2.10.0+cu128", index = "https://download.pytorch.org/whl/cu128" }, ++ { name = "wheel" }, ++] ++ ++[[package]] ++name = "markupsafe" ++version = "3.0.3" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332, upload-time = "2025-09-27T18:36:32.813Z" }, ++ { url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947, upload-time = "2025-09-27T18:36:33.86Z" }, ++ { url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962, upload-time = "2025-09-27T18:36:35.099Z" }, ++ { url = "https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760, upload-time = "2025-09-27T18:36:36.001Z" }, ++ { url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529, upload-time = "2025-09-27T18:36:36.906Z" }, ++ { url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015, upload-time = "2025-09-27T18:36:37.868Z" }, ++] ++ ++[[package]] ++name = "mpmath" ++version = "1.3.0" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = 
"sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, ++] ++ ++[[package]] ++name = "networkx" ++version = "3.6.1" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, ++] ++ ++[[package]] ++name = "nvidia-cublas-cu12" ++version = "12.8.4.1" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/29/99/db44d685f0e257ff0e213ade1964fc459b4a690a73293220e98feb3307cf/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:b86f6dd8935884615a0683b663891d43781b819ac4f2ba2b0c9604676af346d0", size = 590537124, upload-time = "2025-03-07T01:43:53.556Z" }, ++ { url = "https://files.pythonhosted.org/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142", size = 594346921, upload-time = "2025-03-07T01:44:31.254Z" }, ++] ++ ++[[package]] ++name = "nvidia-cuda-cupti-cu12" ++version = "12.8.90" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/d5/1f/b3bd73445e5cb342727fd24fe1f7b748f690b460acadc27ea22f904502c8/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash 
= "sha256:4412396548808ddfed3f17a467b104ba7751e6b58678a4b840675c56d21cf7ed", size = 9533318, upload-time = "2025-03-07T01:40:10.421Z" }, ++ { url = "https://files.pythonhosted.org/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182", size = 10248621, upload-time = "2025-03-07T01:40:21.213Z" }, ++] ++ ++[[package]] ++name = "nvidia-cuda-nvrtc-cu12" ++version = "12.8.93" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994", size = 88040029, upload-time = "2025-03-07T01:42:13.562Z" }, ++ { url = "https://files.pythonhosted.org/packages/eb/d1/e50d0acaab360482034b84b6e27ee83c6738f7d32182b987f9c7a4e32962/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc1fec1e1637854b4c0a65fb9a8346b51dd9ee69e61ebaccc82058441f15bce8", size = 43106076, upload-time = "2025-03-07T01:41:59.817Z" }, ++] ++ ++[[package]] ++name = "nvidia-cuda-runtime-cu12" ++version = "12.8.90" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/7c/75/f865a3b236e4647605ea34cc450900854ba123834a5f1598e160b9530c3a/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:52bf7bbee900262ffefe5e9d5a2a69a30d97e2bc5bb6cc866688caa976966e3d", size = 965265, upload-time = "2025-03-07T01:39:43.533Z" }, ++ { url = 
"https://files.pythonhosted.org/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90", size = 954765, upload-time = "2025-03-07T01:40:01.615Z" }, ++] ++ ++[[package]] ++name = "nvidia-cudnn-cu12" ++version = "9.10.2.21" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "nvidia-cublas-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/fa/41/e79269ce215c857c935fd86bcfe91a451a584dfc27f1e068f568b9ad1ab7/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8", size = 705026878, upload-time = "2025-06-06T21:52:51.348Z" }, ++ { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" }, ++] ++ ++[[package]] ++name = "nvidia-cufft-cu12" ++version = "11.3.3.83" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/60/bc/7771846d3a0272026c416fbb7e5f4c1f146d6d80704534d0b187dd6f4800/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a", size = 193109211, upload-time = 
"2025-03-07T01:44:56.873Z" }, ++ { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" }, ++] ++ ++[[package]] ++name = "nvidia-cufile-cu12" ++version = "1.13.1.3" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc", size = 1197834, upload-time = "2025-03-07T01:45:50.723Z" }, ++ { url = "https://files.pythonhosted.org/packages/1e/f5/5607710447a6fe9fd9b3283956fceeee8a06cda1d2f56ce31371f595db2a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:4beb6d4cce47c1a0f1013d72e02b0994730359e17801d395bdcbf20cfb3bb00a", size = 1120705, upload-time = "2025-03-07T01:45:41.434Z" }, ++] ++ ++[[package]] ++name = "nvidia-curand-cu12" ++version = "10.3.9.90" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/45/5e/92aa15eca622a388b80fbf8375d4760738df6285b1e92c43d37390a33a9a/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dfab99248034673b779bc6decafdc3404a8a6f502462201f2f31f11354204acd", size = 63625754, upload-time = "2025-03-07T01:46:10.735Z" }, ++ { url = "https://files.pythonhosted.org/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9", size = 63619976, upload-time = "2025-03-07T01:46:23.323Z" }, ++] ++ 
++[[package]] ++name = "nvidia-cusolver-cu12" ++version = "11.7.3.90" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "nvidia-cublas-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cusparse-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/c8/32/f7cd6ce8a7690544d084ea21c26e910a97e077c9b7f07bf5de623ee19981/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0", size = 267229841, upload-time = "2025-03-07T01:46:54.356Z" }, ++ { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" }, ++] ++ ++[[package]] ++name = "nvidia-cusparse-cu12" ++version = "12.5.8.93" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/bc/f7/cd777c4109681367721b00a106f491e0d0d15cfa1fd59672ce580ce42a97/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc", size = 288117129, upload-time = 
"2025-03-07T01:47:40.407Z" }, ++ { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" }, ++] ++ ++[[package]] ++name = "nvidia-cusparselt-cu12" ++version = "0.7.1" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/73/b9/598f6ff36faaece4b3c50d26f50e38661499ff34346f00e057760b35cc9d/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5", size = 283835557, upload-time = "2025-02-26T00:16:54.265Z" }, ++ { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" }, ++] ++ ++[[package]] ++name = "nvidia-nccl-cu12" ++version = "2.27.5" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/bb/1c/857979db0ef194ca5e21478a0612bcdbbe59458d7694361882279947b349/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:31432ad4d1fb1004eb0c56203dc9bc2178a1ba69d1d9e02d64a6938ab5e40e7a", size = 322400625, upload-time = "2025-06-26T04:11:04.496Z" }, ++ { url = "https://files.pythonhosted.org/packages/6e/89/f7a07dc961b60645dbbf42e80f2bc85ade7feb9a491b11a1e973aa00071f/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457", size = 322348229, upload-time = 
"2025-06-26T04:11:28.385Z" }, ++] ++ ++[[package]] ++name = "nvidia-nvjitlink-cu12" ++version = "12.8.93" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88", size = 39254836, upload-time = "2025-03-07T01:49:55.661Z" }, ++ { url = "https://files.pythonhosted.org/packages/2a/a2/8cee5da30d13430e87bf99bb33455d2724d0a4a9cb5d7926d80ccb96d008/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:adccd7161ace7261e01bb91e44e88da350895c270d23f744f0820c818b7229e7", size = 38386204, upload-time = "2025-03-07T01:49:43.612Z" }, ++] ++ ++[[package]] ++name = "nvidia-nvshmem-cu12" ++version = "3.4.5" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/1d/6a/03aa43cc9bd3ad91553a88b5f6fb25ed6a3752ae86ce2180221962bc2aa5/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b48363fc6964dede448029434c6abed6c5e37f823cb43c3bcde7ecfc0457e15", size = 138936938, upload-time = "2025-09-06T00:32:05.589Z" }, ++ { url = "https://files.pythonhosted.org/packages/b5/09/6ea3ea725f82e1e76684f0708bbedd871fc96da89945adeba65c3835a64c/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:042f2500f24c021db8a06c5eec2539027d57460e1c1a762055a6554f72c369bd", size = 139103095, upload-time = "2025-09-06T00:32:31.266Z" }, ++] ++ ++[[package]] ++name = "nvidia-nvtx-cu12" ++version = "12.8.90" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = 
"https://files.pythonhosted.org/packages/10/c0/1b303feea90d296f6176f32a2a70b5ef230f9bdeb3a72bddb0dc922dc137/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d7ad891da111ebafbf7e015d34879f7112832fc239ff0d7d776b6cb685274615", size = 91161, upload-time = "2025-03-07T01:42:23.922Z" }, ++ { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" }, ++] ++ ++[[package]] ++name = "packaging" ++version = "26.0" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, ++] ++ ++[[package]] ++name = "setuptools" ++version = "82.0.1" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = "https://files.pythonhosted.org/packages/4f/db/cfac1baf10650ab4d1c111714410d2fbb77ac5a616db26775db562c8fab2/setuptools-82.0.1.tar.gz", hash = "sha256:7d872682c5d01cfde07da7bccc7b65469d3dca203318515ada1de5eda35efbf9", size = 1152316, upload-time = "2026-03-09T12:47:17.221Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/9d/76/f789f7a86709c6b087c5a2f52f911838cad707cc613162401badc665acfe/setuptools-82.0.1-py3-none-any.whl", hash = 
"sha256:a59e362652f08dcd477c78bb6e7bd9d80a7995bc73ce773050228a348ce2e5bb", size = 1006223, upload-time = "2026-03-09T12:47:15.026Z" }, ++] ++ ++[[package]] ++name = "sympy" ++version = "1.14.0" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "mpmath", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, ++] ++ ++[[package]] ++name = "torch" ++version = "2.10.0+cu128" ++source = { registry = "https://download.pytorch.org/whl/cu128" } ++dependencies = [ ++ { name = "cuda-bindings", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "filelock", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "fsspec", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "jinja2", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "networkx", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = 
"nvidia-cublas-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cuda-cupti-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cuda-nvrtc-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cuda-runtime-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cudnn-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cufft-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cufile-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-curand-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cusolver-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cusparse-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-cusparselt-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-nccl-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or 
(platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-nvshmem-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "nvidia-nvtx-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "setuptools", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "triton", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++ { name = "typing-extensions", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++wheels = [ ++ { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6f09cdf2415516be028ae82e6b985bcfc3eac37bc52ab401142689f6224516ca" }, ++ { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:628e89bd5110ced7debee2a57c69959725b7fbc64eab81a39dd70e46c7e28ba5" }, ++] ++ ++[[package]] ++name = "triton" ++version = "3.6.0" ++source = { registry = "https://pypi.org/simple" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/17/5d/08201db32823bdf77a0e2b9039540080b2e5c23a20706ddba942924ebcd6/triton-3.6.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:374f52c11a711fd062b4bfbb201fd9ac0a5febd28a96fb41b4a0f51dde3157f4", size = 176128243, upload-time = "2026-01-20T16:16:07.857Z" }, ++ { url = "https://files.pythonhosted.org/packages/ab/a8/cdf8b3e4c98132f965f88c2313a4b493266832ad47fb52f23d14d4f86bb5/triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74caf5e34b66d9f3a429af689c1c7128daba1d8208df60e81106b115c00d6fca", size = 188266850, upload-time = "2026-01-20T16:00:43.041Z" }, ++] ++ ++[[package]] ++name = "typing-extensions" ++version = "4.15.0" ++source = { registry = "https://pypi.org/simple" } ++sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, ++] ++ ++[[package]] ++name = "wheel" ++version = "0.46.3" ++source = { registry = "https://pypi.org/simple" } ++dependencies = [ ++ { name = "packaging", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ++] ++sdist = { url = "https://files.pythonhosted.org/packages/89/24/a2eb353a6edac9a0303977c4cb048134959dd2a51b48a269dfc9dde00c8a/wheel-0.46.3.tar.gz", hash = "sha256:e3e79874b07d776c40bd6033f8ddf76a7dad46a7b8aa1b2787a83083519a1803", size = 60605, upload-time = "2026-01-22T12:39:49.136Z" } ++wheels = [ ++ { url = "https://files.pythonhosted.org/packages/87/22/b76d483683216dde3d67cba61fb2444be8d5be289bf628c13fc0fd90e5f9/wheel-0.46.3-py3-none-any.whl", hash = 
"sha256:4b399d56c9d9338230118d705d9737a2a468ccca63d5e813e2a4fc7815d8bc4d", size = 30557, upload-time = "2026-01-22T12:39:48.099Z" },
++]
+diff --git a/services/automodel/pyproject.toml b/services/automodel/pyproject.toml
+new file mode 100644
+index 000000000..4ef6031ab
+--- /dev/null
++++ b/services/automodel/pyproject.toml
+@@ -0,0 +1,46 @@
++[build-system]
++requires = ["hatchling"]
++build-backend = "hatchling.build"
++
++[project]
++name = "nmp-automodel"
++version = "0.1.0"
++description = "NeMo Automodel job compiler and platform tasks (no HTTP server)."
++readme = "README.md"
++requires-python = ">=3.11,<3.14"
++dependencies = [
++    "aiofiles>=24.1.0",
++    "httpx>=0.27.0",
++    "nemo-platform",
++    "nmp-common",
++    "pydantic>=2.10.6",
++    "pydantic-settings>=2.6.1",
++]
++
++[project.optional-dependencies]
++dev = [
++    "nmp-testing",
++    "pytest>=8.3.4",
++    "pytest-asyncio>=0.25.3",
++    "pytest-mock>=3.14.0",
++]
++
++# Console entry points; each one resolves to a `run` callable under nmp.automodel.tasks.
++[project.scripts]
++nmp-automodel-file-io = "nmp.automodel.tasks.file_io:run"
++nmp-automodel-model-entity = "nmp.automodel.tasks.model_entity.__main__:run"
++nmp-automodel-training = "nmp.automodel.tasks.training.__main__:run"
++
++[tool.hatch.build.targets.wheel]
++packages = ["src/nmp"]
++
++[tool.pytest.ini_options]
++asyncio_mode = "auto"
++pythonpath = ["src"]
++testpaths = ["tests"]
++
++# First-party packages are resolved from the uv workspace, not from a package index.
++[tool.uv.sources]
++nemo-platform = { workspace = true }
++nmp-common = { workspace = true }
++nmp-testing = { workspace = true }
+diff --git a/services/automodel/src/nmp/automodel/__init__.py b/services/automodel/src/nmp/automodel/__init__.py
+new file mode 100644
+index 000000000..260655621
+--- /dev/null
++++ b/services/automodel/src/nmp/automodel/__init__.py
+@@ -0,0 +1,4 @@
++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
++# SPDX-License-Identifier: Apache-2.0
++
++"""NeMo Automodel platform compiler and tasks."""
+diff --git a/services/automodel/src/nmp/automodel/adapter.py b/services/automodel/src/nmp/automodel/adapter.py
+new file mode 100644
+index 000000000..d2bf9543a
+--- /dev/null
++++ b/services/automodel/src/nmp/automodel/adapter.py
+@@ -0,0 +1,186 @@
++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
++# SPDX-License-Identifier: Apache-2.0
++
++"""Convert plugin ``AutomodelJobOutput`` shape to legacy ``CustomizationJobOutput`` for the compiler."""
++
++from __future__ import annotations
++
++from typing import Any, Literal
++
++from nmp.automodel.api.v2.jobs.schemas import (
++    CustomizationJobOutput,
++    DistillationTraining,
++    IntegrationParams,
++    LoRAParams,
++    OutputResponse,
++    ParallelismParams,
++    SFTTraining,
++    WandBParams,
++)
++from nmp.common.api.common import SecretRef
++
++# Training types this adapter can turn into a compiler training block.
++_SUPPORTED_TRAINING_TYPES = ("sft", "distillation")
++
++
++def _plain(value: Any) -> Any:
++    """Return ``value.value`` for enum-like objects and ``value`` itself otherwise.
++
++    ``model_dump(mode="python")`` keeps Enum members as-is, so a dumped spec may
++    hold enum members where the comparisons in this module expect plain strings.
++    """
++    return getattr(value, "value", value)
++
++
++def _map_finetuning_type(value: str) -> str:
++    """Normalize a finetuning type to ``all_weights``, ``lora_merged`` or ``lora``.
++
++    Anything other than the first two values falls back to ``lora``.
++    """
++    if value == "all_weights":
++        return "all_weights"
++    if value == "lora_merged":
++        return "lora_merged"
++    return "lora"
++
++
++def _build_peft(training: dict[str, Any]) -> LoRAParams | None:
++    """Build the LoRA configuration described by ``training``.
++
++    Returns ``None`` when ``finetuning_type`` is ``all_weights`` (full-weight
++    training). Otherwise reads ``training["lora"]``, defaulting to rank 16 and
++    alpha 32, and forces ``merge`` on when ``finetuning_type`` is ``lora_merged``.
++    """
++    ft = _map_finetuning_type(_plain(training.get("finetuning_type", "lora")))
++    if ft == "all_weights":
++        return None
++    lora = training.get("lora") or {}
++    return LoRAParams(
++        rank=lora.get("rank", 16),
++        alpha=lora.get("alpha", 32),
++        merge=ft == "lora_merged" or lora.get("merge", False),
++        target_modules=lora.get("target_modules"),
++    )
++
++
++def _build_training_block(spec: dict[str, Any]) -> SFTTraining | DistillationTraining:
++    """Build the compiler training block from a plugin job spec.
++
++    Reads the ``training`` section (required) and the optional ``schedule``,
++    ``batch``, ``optimizer`` and ``parallelism`` sections of ``spec``.
++
++    Raises:
++        KeyError: If ``training`` is missing, or a distillation job has no ``teacher_model``.
++        ValueError: If ``training_type`` is neither ``sft`` nor ``distillation``.
++    """
++    training = spec["training"]
++    schedule = spec.get("schedule") or {}
++    batch = spec.get("batch") or {}
++    optimizer = spec.get("optimizer") or {}
++    parallelism = spec.get("parallelism") or {}
++
++    common: dict[str, Any] = {
++        "peft": _build_peft(training),
++        "learning_rate": optimizer.get("learning_rate", 1e-4),
++        "weight_decay": optimizer.get("weight_decay", 0.01),
++        "warmup_steps": optimizer.get("warmup_steps", 0),
++        "epochs": schedule.get("epochs", 1),
++        "max_steps": schedule.get("max_steps"),
++        "val_check_interval": schedule.get("val_check_interval"),
++        "batch_size": batch.get("global_batch_size", 8),
++        "micro_batch_size": batch.get("micro_batch_size", 1),
++        "sequence_packing": batch.get("sequence_packing", False),
++        "max_seq_length": training.get("max_seq_length", 2048),
++        "seed": schedule.get("seed"),
++        "parallelism": ParallelismParams(
++            num_nodes=parallelism.get("num_nodes", 1),
++            num_gpus_per_node=parallelism.get("num_gpus_per_node", 1),
++            tensor_parallel_size=parallelism.get("tensor_parallel_size", 1),
++            pipeline_parallel_size=parallelism.get("pipeline_parallel_size", 1),
++            context_parallel_size=parallelism.get("context_parallel_size", 1),
++            expert_parallel_size=parallelism.get("expert_parallel_size"),
++        ),
++        "execution_profile": training.get("execution_profile"),
++    }
++
++    # A missing or empty training_type keeps the existing default of SFT.
++    training_type: Literal["sft", "distillation"] = _plain(training.get("training_type")) or "sft"
++    if training_type not in _SUPPORTED_TRAINING_TYPES:
++        # Fail loudly instead of silently running SFT for a type this adapter
++        # cannot express (the schemas also define "dpo", for example).
++        raise ValueError(
++            f"Unsupported training_type {training_type!r}; expected one of {_SUPPORTED_TRAINING_TYPES}."
++        )
++    if training_type == "distillation":
++        return DistillationTraining(
++            **common,
++            teacher_model=training["teacher_model"],
++            teacher_precision=training.get("teacher_precision", "bf16"),
++            distillation_ratio=training.get("distillation_ratio", 0.5),
++            distillation_temperature=training.get("distillation_temperature", 1.0),
++        )
++    return SFTTraining(**common)
++
++
++def _build_integrations(spec: dict[str, Any]) -> IntegrationParams | None:
++    """Build ``IntegrationParams`` from ``spec["integrations"]``, or ``None`` when it is absent or empty.
++
++    Only ``project`` and ``api_key_secret`` are forwarded for W&B; a string secret
++    is wrapped in ``SecretRef``. The ``mlflow`` value is passed through unchanged.
++    """
++    raw = spec.get("integrations")
++    if not raw:
++        return None
++    wandb = raw.get("wandb")
++    wandb_params = None
++    if wandb:
++        secret = wandb.get("api_key_secret")
++        wandb_params = WandBParams(
++            project=wandb.get("project"),
++            api_key_secret=SecretRef(secret) if isinstance(secret, str) else secret,
++        )
++    return IntegrationParams(wandb=wandb_params, mlflow=raw.get("mlflow"))
++
++
++def automodel_spec_to_compiler_output(spec: dict[str, Any] | Any) -> CustomizationJobOutput:
++    """Map simplified Automodel job output (plugin schema) to ``CustomizationJobOutput``.
++
++    ``spec`` is either a mapping or an object exposing ``model_dump`` (dumped with
++    ``mode="python"``). ``dataset`` may be a dict with a ``training`` entry or the
++    URI itself; ``output`` may be a dict (converted to ``OutputResponse``) or an
++    already-built value, which is passed through as-is. ``deployment_config`` is
++    always set to ``None``.
++
++    Raises:
++        KeyError: If a required key (e.g. ``dataset``, ``output``, ``model``, ``training``) is missing.
++        ValueError: If the training type is not supported by this adapter.
++    """
++    if hasattr(spec, "model_dump"):
++        data = spec.model_dump(mode="python")
++    else:
++        data = dict(spec)
++
++    dataset = data["dataset"]
++    training_uri = dataset["training"] if isinstance(dataset, dict) else dataset
++
++    output = data["output"]
++    if isinstance(output, dict):
++        out_type = output.get("type", "model")
++        output_resp = OutputResponse(
++            name=output["name"],
++            type=out_type,
++            fileset=output["fileset"],
++            description=output.get("description"),
++        )
++    else:
++        output_resp = output
++
++    return CustomizationJobOutput(
++        name=data.get("name"),
++        model=data["model"],
++        dataset=training_uri,
++        training=_build_training_block(data),
++        integrations=_build_integrations(data),
++        deployment_config=None,
++        output=output_resp,
++    )
+diff --git a/services/automodel/src/nmp/automodel/api/__init__.py b/services/automodel/src/nmp/automodel/api/__init__.py
+new file mode 100644
+index 000000000..e69de29bb
+diff --git a/services/automodel/src/nmp/automodel/api/v2/__init__.py b/services/automodel/src/nmp/automodel/api/v2/__init__.py
+new file mode 100644
+index 000000000..e69de29bb
+diff --git a/services/automodel/src/nmp/automodel/api/v2/jobs/__init__.py b/services/automodel/src/nmp/automodel/api/v2/jobs/__init__.py
+new file mode 100644
+index 000000000..e69de29bb
+diff --git a/services/automodel/src/nmp/automodel/api/v2/jobs/schemas.py b/services/automodel/src/nmp/automodel/api/v2/jobs/schemas.py
+new file mode 100644
+index 000000000..31a115816
+--- /dev/null
++++ b/services/automodel/src/nmp/automodel/api/v2/jobs/schemas.py
+@@ -0,0 +1,639 @@
++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
++# SPDX-License-Identifier: Apache-2.0 ++ ++"""API schemas for customization job endpoints.""" ++ ++from typing import Annotated, Any, Dict, Literal, Optional, Self, Union ++ ++from nmp.automodel.entities.validators import validate_fileset_uri ++from nmp.automodel.entities.values import FinetuningType, OutputNameType, Precision ++from nmp.common.api.common import SecretRef ++from nmp.common.entities.constants import ( ++ MAX_LENGTH_255, ++ REGEX_WORD_CHARACTER_DOT_DASH, ++) ++from pydantic import AfterValidator, BaseModel, ConfigDict, Discriminator, Field, model_validator ++ ++# Important!!! Do not import Pydantic models from this file into tasks. ++# Instead, duplicate models from this file into corresponding task module schemas.py. ++ ++ ++class ValidationError(ValueError): ++ """Raised when job input validation fails.""" ++ ++ pass ++ ++ ++# ============================================================ ++# Sub-Configurations ++# ============================================================ ++ ++ ++class QuantizationParams(BaseModel): ++ """Base model quantization for memory-efficient PEFT training. ++ ++ Supports two scenarios: ++ - Full-precision base model: quantized on-the-fly at load time ++ - Pre-quantized base model: loaded directly at the specified precision ++ ++ In both cases, base model weights are frozen and only the PEFT adapter ++ parameters are trained in full precision. ++ """ ++ ++ precision: Literal["4bit", "8bit"] = Field( ++ default="4bit", ++ description="Quantization precision. '4bit' (NF4) for maximum memory savings, " ++ "'8bit' (LLM.int8) for a balance of quality and memory.", ++ ) ++ ++ ++class _PEFTParams(BaseModel): ++ """Base configuration shared by all PEFT methods.""" ++ ++ # Quantization only makes sense with PEFT (quantized base weights are frozen, so you need trainable ++ # adapter parameters), which is why it lives here rather than on _TrainingBase. 
++ quantization: Optional[QuantizationParams] = Field( ++ default=None, ++ description="Enable quantized training to reduce GPU memory. " ++ "If the base model is full-precision, it will be quantized at load time. " ++ "If the base model is already pre-quantized, this configures the expected precision. " ++ "The trained adapter remains full-precision.", ++ ) ++ ++ ++class LoRAParams(_PEFTParams): ++ """LoRA adapter configuration.""" ++ ++ type: Literal["lora"] = "lora" ++ ++ rank: int = Field( ++ default=8, ++ ge=1, ++ le=256, ++ description="LoRA rank (low-rank dimension). Higher values increase capacity but use more memory.", ++ ) ++ alpha: int = Field( ++ default=32, ++ ge=1, ++ description="LoRA alpha scaling factor. Common practice: alpha = 2-4x rank.", ++ ) ++ dropout: float = Field( ++ default=0.0, ++ ge=0.0, ++ le=1.0, ++ description="LoRA dropout probability for regularization.", ++ ) ++ target_modules: Optional[list[str]] = Field( ++ default=None, ++ description="Module name patterns to apply LoRA to (e.g., ['*.q_proj', '*.v_proj']). " ++ "If not set, applies to all '*proj' linear layers.", ++ ) ++ merge: bool = Field( ++ default=False, ++ description="Merge LoRA weights into base model after training. " ++ "Produces a full-weight checkpoint instead of an adapter.", ++ ) ++ use_dora: bool = Field( ++ default=False, ++ description="Enable DoRA (Weight-Decomposed Low-Rank Adaptation). " ++ "Decomposes weight updates into magnitude and direction components. 
" ++ "Can improve quality especially at low ranks, but adds training overhead.", ++ ) ++ ++ @model_validator(mode="after") ++ def _validate_unsupported_features(self) -> Self: ++ if self.quantization is not None: ++ raise ValueError("Quantized LoRA training is not yet supported.") ++ if self.use_dora: ++ raise ValueError("DoRA is not yet supported.") ++ return self ++ ++ ++# When a second PEFT method is added (e.g., IA3Config), change this to: ++# PeftMethod = Annotated[Union[LoRAParams, IA3Config], Discriminator("type")] ++PeftMethod = LoRAParams ++ ++ ++class ParallelismParams(BaseModel): ++ """Distributed training parallelism configuration. ++ ++ Most users only need num_gpus_per_node. Advanced users can configure ++ tensor/pipeline/context/expert parallelism for large models. ++ """ ++ ++ num_gpus_per_node: int = Field(default=1, gt=0, description="Number of gpus per node.") ++ num_nodes: int = Field(default=1, gt=0, description="Number of nodes.") ++ tensor_parallel_size: int = Field(default=1, gt=0, description="Tensor parallel size.") ++ pipeline_parallel_size: int = Field(default=1, gt=0, description="Pipeline parallel size.") ++ context_parallel_size: int = Field(default=1, gt=0, description="Context parallel size.") ++ expert_parallel_size: Optional[int] = Field(default=None, gt=0, description="Expert parallel size (MoE models).") ++ sequence_parallel: bool = Field(default=False, description="Enable sequence parallelism.") ++ ++ ++# ============================================================ ++# Training Method Discriminated Union ++# ============================================================ ++ ++ ++class _TrainingBase(BaseModel): ++ """Common training configuration shared by all methods. ++ ++ Flat hyperparameters match the ML practitioner mental model ++ (like HuggingFace TrainingArguments / TRL SFTConfig). ++ Only parallelism is grouped — it's enterprise infrastructure. 
++ """ ++ ++ # --- PEFT (orthogonal to training method) --- ++ peft: Optional[PeftMethod] = Field( ++ default=None, ++ description="PEFT adapter configuration. If set, trains a parameter-efficient adapter. " ++ "If omitted, performs full-weight fine-tuning.", ++ ) ++ ++ # --- Optimizer --- ++ learning_rate: float = Field( ++ default=1e-4, ++ description="Peak learning rate. Optimal value will depend on training type and PEFT. " ++ "For SFT without LoRA, start with 5e-5. If using LoRA start with 1e-4. Lowering the value " ++ "can enable for slower, more precise training; Raising the value speeds up learning.", ++ ) ++ min_learning_rate: Optional[float] = Field( ++ default=None, ++ description="Minimum learning rate for cosine decay. Optional; used with learning rate schedules.", ++ ) ++ weight_decay: float = Field( ++ default=0.01, ++ description="Weight decay coefficient. Helps prevent overfitting.", ++ ) ++ adam_beta1: float = Field( ++ default=0.9, ++ description="Adam beta1 parameter. Adjust for optimizer tuning.", ++ ) ++ adam_beta2: float = Field( ++ default=0.999, ++ description="Adam beta2 parameter. Adjust for optimizer tuning.", ++ ) ++ warmup_steps: int = Field( ++ default=0, ++ ge=0, ++ description="Linear warmup steps. Recommended: 10% of total training steps for stable training.", ++ ) ++ optimizer: Optional[str] = Field(default=None, description="Optimizer name (e.g., 'adamw').") ++ ++ # --- Schedule --- ++ epochs: int = Field( ++ default=1, ++ gt=0, ++ description="Number of complete passes through the dataset. The ideal number of epochs depends " ++ "on the training method, the number of training samples, and size of the model. Start with 3 for " ++ "a reasonable value. Monitor the validation and training loss curves. If both are still " ++ "decreasing, you can increase this number.", ++ ) ++ max_steps: Optional[int] = Field( ++ default=None, ++ gt=0, ++ description="Max training steps. 
Overrides epochs if set.", ++ ) ++ log_every_n_steps: Optional[int] = Field( ++ default=None, ++ description="Logging frequency in steps. Controls how often training metrics are logged.", ++ ) ++ val_check_interval: Optional[float] = Field( ++ default=None, ++ description="Validation interval. Float <= 1.0 is fraction of epoch; > 1.0 is step count.", ++ ) ++ ++ # --- Batch --- ++ batch_size: int = Field( ++ default=32, ++ gt=0, ++ description="Global batch size across all GPUs. Higher = faster but more memory. If OOM, reduce this first.", ++ ) ++ micro_batch_size: int = Field( ++ default=1, ++ gt=0, ++ description="Per-GPU micro batch size. Keep small (1-2) for large models to avoid OOM.", ++ ) ++ sequence_packing: bool = Field( ++ default=False, ++ description="Enable sequence packing for efficiency. Can improve training speed.", ++ ) ++ ++ # --- Model --- ++ max_seq_length: int = Field( ++ default=2048, ++ gt=0, ++ description="Maximum token sequence length for training. Higher = more memory, longer training.", ++ ) ++ precision: Optional[Precision] = Field( ++ default=None, ++ description="Model precision for training. Auto-detected if unset.", ++ ) ++ seed: Optional[int] = Field( ++ default=None, ++ description="Random seed for reproducibility. Optional.", ++ ) ++ ++ # --- Enterprise Infrastructure --- ++ parallelism: ParallelismParams = Field(default_factory=ParallelismParams) ++ execution_profile: Optional[str] = Field( ++ default=None, ++ min_length=1, ++ description="Execution profile for the GPU training step. Maps to an operator-configured profile " ++ "(e.g., 'a100', 'high_priority'). 
If omitted, uses the service-level default.", ++ ) ++ ++ model_config = {"protected_namespaces": ()} ++ ++ @property ++ def finetuning_type(self) -> FinetuningType: ++ """Derived from peft config: presence → adapter type, absence → full-weight.""" ++ if self.peft is None: ++ return FinetuningType.ALL_WEIGHTS ++ if isinstance(self.peft, LoRAParams): ++ return FinetuningType.LORA_MERGED if self.peft.merge else FinetuningType.LORA ++ raise ValueError(f"Unknown PEFT type: {type(self.peft).__name__}") ++ ++ ++class SFTTraining(_TrainingBase): ++ """Supervised Fine-Tuning.""" ++ ++ type: Literal["sft"] = "sft" ++ ++ ++class DistillationTraining(_TrainingBase): ++ """Knowledge Distillation with a teacher model. ++ ++ Customizer's differentiator — not available in Unsloth. ++ Trains the student model to match the teacher's output distribution. ++ """ ++ ++ type: Literal["distillation"] = "distillation" ++ teacher_model: str = Field( ++ description="Teacher model URN (e.g., 'workspace/model-name'). " ++ "Must have the same vocabulary as the student model.", ++ ) ++ teacher_precision: Literal["bf16", "fp16", "fp32"] = Field( ++ default="bf16", ++ description="Precision for loading the frozen teacher model. " ++ "Lower precision reduces memory but may affect logit quality.", ++ ) ++ distillation_ratio: float = Field( ++ default=0.5, ++ ge=0.0, ++ le=1.0, ++ description="Balance between CE loss and KD loss. 0.0 = CE only, 1.0 = KD only.", ++ ) ++ distillation_temperature: float = Field( ++ default=1.0, ++ gt=0.0, ++ description="Softmax temperature for KD. Higher = softer probability distributions.", ++ ) ++ ++ ++class DPOTraining(_TrainingBase): ++ """Direct Preference Optimization.""" ++ ++ type: Literal["dpo"] = "dpo" ++ ref_policy_kl_penalty: float = Field( ++ default=0.05, ge=0.0, description="KL penalty coefficient (beta in DPO paper)." 
++ ) ++ preference_average_log_probs: bool = Field( ++ default=False, description="Average log probabilities for preference loss calculation." ++ ) ++ sft_average_log_probs: bool = Field( ++ default=False, description="Average log probabilities for SFT regularization loss." ++ ) ++ preference_loss_weight: float = Field(default=1.0, ge=0.0, description="Weight for the preference (DPO) loss term.") ++ sft_loss_weight: float = Field( ++ default=0.0, ge=0.0, description="Weight for SFT regularization loss (0 = disabled)." ++ ) ++ max_grad_norm: float = Field(default=1.0, ge=0.0, description="Maximum gradient norm for clipping.") ++ ++ @model_validator(mode="after") ++ def _peft_not_yet_supported(self) -> Self: ++ if self.peft is not None: ++ raise ValueError( ++ "PEFT is not yet supported with DPO training. Use full-weight training by omitting the 'peft' field." ++ ) ++ return self ++ ++ ++AnyTraining = Union[SFTTraining, DistillationTraining, DPOTraining] ++TrainingMethod = Annotated[AnyTraining, Discriminator("type")] ++ ++ ++# ============================================================ ++# Integration Configs (unchanged) ++# ============================================================ ++ ++ ++class WandBParams(BaseModel): ++ """Weights & Biases integration configuration. ++ ++ To use W&B, provide an api_key_secret referencing a secret that contains ++ the WANDB_API_KEY value. Optionally provide base_url for self-hosted W&B servers. ++ """ ++ ++ project: Optional[str] = Field( ++ default=None, ++ description="W&B project name (groups related runs). Defaults to output.name if not set.", ++ ) ++ name: Optional[str] = Field( ++ default=None, ++ description="W&B run name. 
Defaults to job_id if not provided.", ++ ) ++ entity: Optional[str] = Field( ++ default=None, ++ description="W&B entity (team or username).", ++ ) ++ tags: Optional[list[str]] = Field( ++ default=None, ++ description="W&B tags for filtering runs.", ++ ) ++ notes: Optional[str] = Field( ++ default=None, ++ description="W&B notes/description for the run.", ++ ) ++ base_url: Optional[str] = Field( ++ default=None, ++ description="Base URL for self-hosted W&B server (e.g., 'https://wandb.mycompany.com'). " ++ "If not provided, uses the default W&B cloud service.", ++ ) ++ api_key_secret: SecretRef | None = Field( ++ default=None, ++ description="Reference to a secret containing the WANDB_API_KEY. " ++ "Format: 'secret_name' (uses request workspace) or 'workspace/secret_name' (explicit workspace).", ++ ) ++ ++ ++class MLflowParams(BaseModel): ++ """MLflow integration configuration.""" ++ ++ experiment_name: Optional[str] = Field( ++ default=None, ++ description="MLflow experiment name (groups related runs). Defaults to output.name if not set.", ++ ) ++ run_name: Optional[str] = Field( ++ default=None, ++ description="MLflow run name. Defaults to job_id if not provided.", ++ ) ++ tags: Optional[dict[str, str]] = Field( ++ default=None, ++ description="MLflow tags as key-value pairs for filtering runs.", ++ ) ++ description: Optional[str] = Field( ++ default=None, ++ description="MLflow run description.", ++ ) ++ tracking_uri: Optional[str] = Field( ++ default=None, ++ description="MLflow tracking server URI (e.g., 'http://mlflow.mycompany.com:5000'). " ++ "Can also be set via MLFLOW_TRACKING_URI environment variable.", ++ ) ++ ++ ++class IntegrationParams(BaseModel): ++ """Third-party integration configurations. ++ ++ Each integration type has its own optional field. To enable an integration, ++ provide its configuration object. Omit or set to None to disable. 
++ """ ++ ++ wandb: Optional[WandBParams] = Field( ++ default=None, ++ description="Weights & Biases integration configuration.", ++ ) ++ mlflow: Optional[MLflowParams] = Field( ++ default=None, ++ description="MLflow integration configuration.", ++ ) ++ ++ ++# ============================================================ ++# Deployment Config ++# ============================================================ ++ ++ ++class ToolCallParams(BaseModel): ++ """Tool calling configuration for NIM deployments.""" ++ ++ tool_call_parser: Optional[str] = Field( ++ default=None, ++ description="Name of the tool call parser to use (e.g., 'openai', 'hermes', 'pythonic', 'llama3_json', 'mistral').", ++ ) ++ tool_call_plugin: Optional[str] = Field( ++ default=None, ++ pattern=r"^[\w\-.]+/[\w\-.]+$", ++ description="Reference to a fileset containing the custom tool call plugin Python file. " ++ "Expected format: '{workspace}/{fileset_name}'.", ++ ) ++ auto_tool_choice: Optional[bool] = Field( ++ default=None, ++ description="Whether to enable automatic tool choice.", ++ ) ++ ++ ++class DeploymentParams(BaseModel): ++ """Inline deployment parameters for creating a new ModelDeploymentConfig.""" ++ ++ gpu: int = Field( ++ default=1, ++ description="Number of GPUs required for the deployment", ++ ) ++ ++ additional_envs: Optional[dict[str, str]] = Field( ++ default=None, ++ description="Additional environment variables for the deployment", ++ ) ++ ++ disk_size: Optional[str] = Field( ++ default=None, ++ description="Disk size for the deployment", ++ ) ++ ++ image_name: Optional[str] = Field( ++ default=None, ++ description="Container image name from NGC. 
# ============================================================
# Output
# ============================================================


class _OutputBase(BaseModel):
    """Shared fields for output artifact request and response."""

    name: str = Field(
        pattern=REGEX_WORD_CHARACTER_DOT_DASH,
        max_length=MAX_LENGTH_255,
        description="Name of the output artifact. Used to identify it during deployment and inference.",
        examples=["my-finetuned-llama", "llama-3-8b-lora-v2"],
    )


class OutputRequest(_OutputBase):
    """Output artifact configuration provided by the user."""


class OutputResponse(_OutputBase):
    """Resolved output artifact details returned by the server."""

    type: OutputNameType = Field(
        description="Output artifact type. Either `model` (full fine-tuned weights) or `adapter` (LoRA adapter weights).",
        examples=["model", "adapter"],
    )
    fileset: str = Field(
        pattern=REGEX_WORD_CHARACTER_DOT_DASH,
        max_length=MAX_LENGTH_255,
        description="FileSet name where output artifacts are stored.",
        examples=["my-model-a1b2c3d4e5f6"],
    )


# ============================================================
# Job Schemas
# ============================================================


class _CustomizationJobBase(BaseModel):
    """Base schema with common fields for customization jobs."""

    model: str = Field(description="Model reference (e.g., 'workspace/model-name').")
    dataset: Annotated[str, AfterValidator(validate_fileset_uri)] = Field(
        description="Training dataset fileset as 'workspace/name' or 'name' (resolved in the job path workspace)."
    )
    training: TrainingMethod = Field(description="Training method and hyperparameters.")
    integrations: Optional[IntegrationParams] = Field(
        default=None,
        description="Third-party integrations (e.g., Weights & Biases, MLflow).",
    )
    deployment_config: Optional[str | DeploymentParams] = Field(
        default=None,
        description="Deployment configuration for auto-deploying the model after training. "
        "Pass a string to reference an existing ModelDeploymentConfig by name "
        "(e.g., 'my-config' or 'workspace/my-config'). "
        "An object provides inline NIM deployment parameters. "
        "Omit to skip deployment.",
    )
    custom_fields: Dict[str, Any] = Field(default_factory=dict, description="Custom user-defined fields.")

    # protected_namespaces=() lets the schema declare fields starting with "model"
    # (here: `model`) without pydantic's protected-namespace warning.
    model_config = ConfigDict(protected_namespaces=(), regex_engine="python-re")


class CustomizationJobInput(_CustomizationJobBase):
    """Input schema for creating customization jobs."""

    output: Optional[OutputRequest] = Field(
        default=None,
        description="Output artifact configuration. If omitted, name is auto-generated as "
        "`{model}-{dataset}-`. The output type (model vs adapter) is always "
        "inferred from the training configuration.",
        examples=[{"name": "my-finetuned-llama"}],
    )

    @model_validator(mode="before")
    @classmethod
    def reject_legacy_fields(cls, data: object) -> object:
        """Reject payloads that still carry the removed ``output_model`` key.

        Runs before field validation; non-dict inputs are passed through untouched.

        Raises:
            ValueError: If ``data`` is a dict containing ``output_model``.
        """
        if isinstance(data, dict) and "output_model" in data:
            raise ValueError("spec.output_model was removed. Use spec.output instead.")
        return data

    @model_validator(mode="after")
    def _reject_lora_without_lora_enabled(self) -> Self:
        """Reject an unmerged LoRA job whose inline deployment disables LoRA.

        Only inline ``DeploymentParams`` are checked here; a string reference to an
        existing deployment config cannot be inspected at schema-validation time.

        Raises:
            ValueError: If training produces a standalone LoRA adapter (``merge`` is
                falsy) and ``deployment_config.lora_enabled`` is false.
        """
        peft = self.training.peft
        dc = self.deployment_config
        trains_adapter = isinstance(peft, LoRAParams) and not peft.merge
        lora_disabled = isinstance(dc, DeploymentParams) and not dc.lora_enabled
        if trains_adapter and lora_disabled:
            raise ValueError(
                "deployment_config.lora_enabled must be true (or omitted) when training a LoRA adapter. "
                "Setting lora_enabled=false would deploy the base model without LoRA support, "
                "making the trained adapter unservable."
            )
        return self


class CustomizationJobOutput(_CustomizationJobBase):
    """Customization job details returned by the server."""

    output: OutputResponse = Field(
        description="Output artifact created by this job.",
        examples=[
            {"name": "my-finetuned-llama", "type": "model", "fileset": "my-finetuned-llama"},
            {"name": "llama-3-8b-lora-v2", "type": "adapter", "fileset": "llama-3-8b-lora-v2-a1b2c3d4e5f6"},
        ],
    )

    def validate_for_training(self) -> None:
        """Validate the parallelism and batch-size settings of this job for training.

        Call this after any enrichment has been applied.

        Checks, in order:
          1. total GPUs is divisible by tensor * pipeline * context parallel size;
          2. if ``expert_parallel_size`` is set, (data parallel * context parallel)
             is divisible by it, and tensor parallelism is not combined with
             expert parallelism > 1;
          3. ``batch_size`` is divisible by ``micro_batch_size`` * data parallel size.

        Raises:
            ValidationError: If any of the checks above fails.
        """
        training = self.training
        p = training.parallelism
        num_nodes = p.num_nodes
        num_gpus_per_node = p.num_gpus_per_node
        tp = p.tensor_parallel_size
        pp = p.pipeline_parallel_size
        cp = p.context_parallel_size
        ep = p.expert_parallel_size

        total_gpus = num_gpus_per_node * num_nodes
        model_parallel_size = tp * pp * cp
        if total_gpus % model_parallel_size != 0:
            raise ValidationError(
                f"Total GPUs ({total_gpus}) must be divisible by "
                f"tensor_parallel_size ({tp}) * "
                f"pipeline_parallel_size ({pp}) * "
                f"context_parallel_size ({cp}) = {model_parallel_size}"
            )

        derived_dp = total_gpus // model_parallel_size

        # Note: Expert model parallelism (EP) is NOT a dimension that divides world_size like TP/PP.
        # Instead, EP operates orthogonally, therefore we validate it separately.
        # It distributes experts across the dp × cp dimension.
        # FSDP2 requires: (dp_size × cp_size) % ep_size == 0
        if ep is not None:
            dp_cp = derived_dp * cp
            if dp_cp % ep != 0:
                raise ValidationError(
                    f"(data_parallel_size * context_parallel_size) ({derived_dp} * {cp} = {dp_cp}) "
                    f"must be divisible by expert_parallel_size ({ep})"
                )
            # MoE models on multi-GPU don't support tensor parallelism
            # in Automodel's MoE parallelizer. See: nemo_automodel/components/moe/parallelizer.py
            if ep > 1 and tp > 1 and total_gpus > 1:
                raise ValidationError(
                    f"Tensor parallelism (tensor_parallel_size={tp}) is not supported for MoE models. "
                    f"When expert_parallel_size > 1 ({ep}), tensor_parallel_size must be 1."
                )

        gb = training.batch_size
        mb = training.micro_batch_size
        divisor = mb * derived_dp
        if gb % divisor != 0:
            # Suggest the nearest valid multiples below and above batch_size. When
            # batch_size is smaller than the divisor there is no positive multiple
            # below it, so only the upper one is offered (previously the same value
            # was printed twice, e.g. "to 8 or 8").
            lower = divisor * (gb // divisor)
            upper = lower + divisor
            suggestion = f"{lower} or {upper}" if lower > 0 else f"{upper}"
            raise ValidationError(
                f"batch_size ({gb}) must be divisible by "
                f"micro_batch_size ({mb}) * data_parallel_size ({derived_dp}) = {divisor}. "
                f"Consider adjusting batch_size to {suggestion}."
            )
++ ) +diff --git a/services/automodel/src/nmp/automodel/app/__init__.py b/services/automodel/src/nmp/automodel/app/__init__.py +new file mode 100644 +index 000000000..35a0c9116 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/app/__init__.py +@@ -0,0 +1,4 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Customizer application module.""" +diff --git a/services/automodel/src/nmp/automodel/app/constants.py b/services/automodel/src/nmp/automodel/app/constants.py +new file mode 100644 +index 000000000..083498ada +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/app/constants.py +@@ -0,0 +1,173 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++from nmp.common.jobs.constants import DEFAULT_JOB_STORAGE_PATH ++ ++SERVICE_NAME = "customizer" ++ ++# Global default seed for reproducibility ++DEFAULT_SEED = 1111 ++ ++# Relative directory names (used as subdirectory names under job storage) ++DEFAULT_MODEL_OUTPUT_DIR_NAME = "model" ++DEFAULT_DATASET_OUTPUT_DIR_NAME = "dataset" ++DEFAULT_TEACHER_MODEL_DIR_NAME = "teacher_model" ++DEFAULT_TRAINING_OUTPUT_DIR_NAME = "training" ++DEFAULT_OUTPUT_MODEL_DIR_NAME = "output_model" ++DEFAULT_TRAINING_RESULT_FILE_NAME = "customizer_training_result.json" ++ ++# Absolute paths (used in PlatformJobSpec for cross-step file sharing via PVC) ++DEFAULT_MODEL_PATH = f"{DEFAULT_JOB_STORAGE_PATH}/{DEFAULT_MODEL_OUTPUT_DIR_NAME}" ++DEFAULT_DATASET_PATH = f"{DEFAULT_JOB_STORAGE_PATH}/{DEFAULT_DATASET_OUTPUT_DIR_NAME}" ++DEFAULT_TEACHER_MODEL_PATH = f"{DEFAULT_JOB_STORAGE_PATH}/{DEFAULT_TEACHER_MODEL_DIR_NAME}" ++DEFAULT_TRAINING_OUTPUT_PATH = f"{DEFAULT_JOB_STORAGE_PATH}/{DEFAULT_TRAINING_OUTPUT_DIR_NAME}" ++DEFAULT_OUTPUT_MODEL_PATH = f"{DEFAULT_JOB_STORAGE_PATH}/{DEFAULT_OUTPUT_MODEL_DIR_NAME}" ++ ++NMP_JOBS_URL_ENVVAR = 
"NMP_JOBS_URL" ++NMP_FILES_URL_ENVVAR = "NMP_FILES_URL" ++ ++# Models whose checkpoints require transformers-v4-compatible config.json output. ++# When v4_compatible is enabled, the original pretrained config.json is preserved ++# alongside a config.v5.json so downstream consumers (e.g. vLLM) that expect ++# a v4-format config continue to work. ++# using frozenset for faster lookup ++V4_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: frozenset[str] = frozenset( ++ { ++ "ApertusForCausalLM", ++ "ArceeForCausalLM", ++ "AriaTextForCausalLM", ++ "BambaForCausalLM", ++ "BartForCausalLM", ++ "BertLMHeadModel", ++ "BertGenerationDecoder", ++ "BigBirdForCausalLM", ++ "BigBirdPegasusForCausalLM", ++ "BioGptForCausalLM", ++ "BitNetForCausalLM", ++ "BlenderbotForCausalLM", ++ "BlenderbotSmallForCausalLM", ++ "BloomForCausalLM", ++ "BltForCausalLM", ++ "CamembertForCausalLM", ++ "LlamaForCausalLM", ++ "CodeGenForCausalLM", ++ "CohereForCausalLM", ++ "Cohere2ForCausalLM", ++ "CpmAntForCausalLM", ++ "CTRLLMHeadModel", ++ "Data2VecTextForCausalLM", ++ "DbrxForCausalLM", ++ "DeepseekV2ForCausalLM", ++ "DeepseekV3ForCausalLM", ++ "DiffLlamaForCausalLM", ++ "DogeForCausalLM", ++ "Dots1ForCausalLM", ++ "ElectraForCausalLM", ++ "Emu3ForCausalLM", ++ "ErnieForCausalLM", ++ "Ernie4_5ForCausalLM", ++ "Ernie4_5_MoeForCausalLM", ++ "Exaone4ForCausalLM", ++ "FalconForCausalLM", ++ "FalconH1ForCausalLM", ++ "FalconMambaForCausalLM", ++ "FlexOlmoForCausalLM", ++ "FuyuForCausalLM", ++ "GemmaForCausalLM", ++ "Gemma2ForCausalLM", ++ "Gemma3ForConditionalGeneration", ++ "Gemma3ForCausalLM", ++ "Gemma3nForConditionalGeneration", ++ "Gemma3nForCausalLM", ++ "GitForCausalLM", ++ "GlmForCausalLM", ++ "Glm4ForCausalLM", ++ "Glm4MoeForCausalLM", ++ "GotOcr2ForConditionalGeneration", ++ "GPT2LMHeadModel", ++ "GPTBigCodeForCausalLM", ++ "GPTNeoForCausalLM", ++ "GPTNeoXForCausalLM", ++ "GPTNeoXJapaneseForCausalLM", ++ "GptOssForCausalLM", ++ "GPTJForCausalLM", ++ "GraniteForCausalLM", ++ "GraniteMoeForCausalLM", ++ 
"GraniteMoeHybridForCausalLM", ++ "GraniteMoeSharedForCausalLM", ++ "HeliumForCausalLM", ++ "HunYuanDenseV1ForCausalLM", ++ "HunYuanMoEV1ForCausalLM", ++ "JambaForCausalLM", ++ "JetMoeForCausalLM", ++ "Lfm2ForCausalLM", ++ "Llama4ForCausalLM", ++ "LongcatFlashForCausalLM", ++ "MambaForCausalLM", ++ "Mamba2ForCausalLM", ++ "MarianForCausalLM", ++ "MBartForCausalLM", ++ "MegaForCausalLM", ++ "MegatronBertForCausalLM", ++ "MiniMaxForCausalLM", ++ "MinistralForCausalLM", ++ "MistralForCausalLM", ++ "MixtralForCausalLM", ++ "MllamaForCausalLM", ++ "ModernBertDecoderForCausalLM", ++ "MoshiForCausalLM", ++ "MptForCausalLM", ++ "MusicgenForCausalLM", ++ "MusicgenMelodyForCausalLM", ++ "MvpForCausalLM", ++ "NemotronForCausalLM", ++ "OlmoForCausalLM", ++ "Olmo2ForCausalLM", ++ "Olmo3ForCausalLM", ++ "OlmoeForCausalLM", ++ "OpenLlamaForCausalLM", ++ "OpenAIGPTLMHeadModel", ++ "OPTForCausalLM", ++ "PegasusForCausalLM", ++ "PersimmonForCausalLM", ++ "PhiForCausalLM", ++ "Phi3ForCausalLM", ++ "Phi4MultimodalForCausalLM", ++ "PhimoeForCausalLM", ++ "PLBartForCausalLM", ++ "ProphetNetForCausalLM", ++ "QDQBertLMHeadModel", ++ "Qwen2ForCausalLM", ++ "Qwen2MoeForCausalLM", ++ "Qwen3ForCausalLM", ++ "Qwen3MoeForCausalLM", ++ "Qwen3NextForCausalLM", ++ "RecurrentGemmaForCausalLM", ++ "ReformerModelWithLMHead", ++ "RemBertForCausalLM", ++ "RobertaForCausalLM", ++ "RobertaPreLayerNormForCausalLM", ++ "RoCBertForCausalLM", ++ "RoFormerForCausalLM", ++ "RwkvForCausalLM", ++ "SeedOssForCausalLM", ++ "SmolLM3ForCausalLM", ++ "Speech2Text2ForCausalLM", ++ "StableLmForCausalLM", ++ "Starcoder2ForCausalLM", ++ "TransfoXLLMHeadModel", ++ "TrOCRForCausalLM", ++ "VaultGemmaForCausalLM", ++ "WhisperForCausalLM", ++ "XGLMForCausalLM", ++ "XLMWithLMHeadModel", ++ "XLMProphetNetForCausalLM", ++ "XLMRobertaForCausalLM", ++ "XLMRobertaXLForCausalLM", ++ "XLNetLMHeadModel", ++ "xLSTMForCausalLM", ++ "XmodForCausalLM", ++ "ZambaForCausalLM", ++ "Zamba2ForCausalLM", ++ } ++) +diff --git 
a/services/automodel/src/nmp/automodel/app/jobs/__init__.py b/services/automodel/src/nmp/automodel/app/jobs/__init__.py +new file mode 100644 +index 000000000..e5725ea5a +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/app/jobs/__init__.py +@@ -0,0 +1,2 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 +diff --git a/services/automodel/src/nmp/automodel/app/jobs/compiler.py b/services/automodel/src/nmp/automodel/app/jobs/compiler.py +new file mode 100644 +index 000000000..39bb0b549 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/app/jobs/compiler.py +@@ -0,0 +1,488 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Job compiler - transforms CustomizationJobOutput into PlatformJobSpec.""" ++ ++import logging ++ ++from nemo_platform import AsyncNeMoPlatform, NotFoundError ++from nemo_platform.types.models.model_entity import ModelEntity ++from nmp.common.auth import AuthClient, auth_client_context ++from nmp.common.entities import EntityClient ++from nmp.common.entities.utils import parse_entity_ref ++from nmp.common.jobs.api_factory import ( ++ ContainerSpec, ++ CPUExecutionProviderSpec, ++ EnvironmentVariable, ++ PlatformJobSpec, ++ PlatformJobStep, ++ ResourcesLimitsSpec, ++ ResourcesRequestsSpec, ++ ResourcesSpec, ++) ++from nmp.common.jobs.constants import DEFAULT_JOB_STORAGE_PATH, PERSISTENT_JOB_STORAGE_PATH_ENVVAR ++from nmp.common.jobs.exceptions import PlatformJobCompilationError ++from nmp.automodel.images import get_tasks_image ++from nmp.automodel.api.v2.jobs.schemas import ( ++ CustomizationJobInput, ++ CustomizationJobOutput, ++ DeploymentParams, ++ DistillationTraining, ++ LoRAParams, ++ ValidationError, ++) ++from nmp.automodel.app.constants import ( ++ DEFAULT_DATASET_PATH, ++ DEFAULT_MODEL_PATH, ++ DEFAULT_OUTPUT_MODEL_PATH, ++ 
logger = logging.getLogger(__name__)


def _get_cpu_resources() -> ResourcesSpec:
    """Get default CPU resources for download/upload tasks.

    Limits and requests are read from the service ``config`` object.
    """
    return ResourcesSpec(
        limits=ResourcesLimitsSpec(
            cpu=config.default_job_resource_cpu_limit,
            memory=config.default_job_resource_memory_limit,
        ),
        requests=ResourcesRequestsSpec(
            cpu=config.default_job_resource_cpu_request,
            memory=config.default_job_resource_memory_request,
        ),
    )


def _get_base_environment() -> list[EnvironmentVariable]:
    """Get base environment variables for all tasks.

    Returns a fresh list holding the persistent job storage path variable.
    """
    return [
        EnvironmentVariable(
            name=PERSISTENT_JOB_STORAGE_PATH_ENVVAR,
            value=DEFAULT_JOB_STORAGE_PATH,
        ),
    ]


def _extract_model_uri(me: ModelEntity) -> str | None:
    """Extract model_uri from the model entity.

    Args:
        me: The model entity.

    Returns:
        The fileset string if available, None otherwise (an empty value is
        normalized to None).
    """
    return me.fileset or None


def _append_download_if_present(
    downloads: list[DownloadItem],
    fileset_name: str | None,
    dest: str,
    field_name: str,
) -> None:
    """Append a download item if a FileSet ref is present.

    Args:
        downloads: List to append to; mutated in place.
        fileset_name: FileSet reference string; nothing is appended when falsy.
        dest: Destination path for the downloaded files.
        field_name: Name of the originating field, used only in the log message.
    """
    if not fileset_name:
        return
    fileset = FileSetRef.model_validate(fileset_name)
    downloads.append(DownloadItem(src=fileset, dest=dest))
    # Lazy %-formatting: the message is only rendered if INFO is enabled.
    logger.info("Detected %s FileSet reference: %s", field_name, fileset)


def _build_file_download_config(
    job_spec: CustomizationJobOutput,
    me: ModelEntity,
    teacher_me: ModelEntity | None = None,
) -> FileIOTaskConfig:
    """Build the configuration for the file_io download task.

    Extracts FileSet references from model_uri and dataset fields.
    Fileset refs use workspace/name or name (optional legacy fileset:// prefix is stripped).

    Args:
        job_spec: The customization job output specification.
        me: The model entity being trained.
        teacher_me: Optional teacher model entity for knowledge distillation jobs.

    Returns:
        FileIOTaskConfig with download items for any fileset refs found.
    """
    downloads: list[DownloadItem] = []

    _append_download_if_present(
        downloads,
        fileset_name=_extract_model_uri(me),
        dest=DEFAULT_MODEL_PATH,
        field_name="model",
    )
    _append_download_if_present(
        downloads,
        fileset_name=job_spec.dataset,
        dest=DEFAULT_DATASET_PATH,
        field_name="dataset",
    )

    if teacher_me is not None:
        _append_download_if_present(
            downloads,
            fileset_name=_extract_model_uri(teacher_me),
            dest=DEFAULT_TEACHER_MODEL_PATH,
            field_name="teacher_model",
        )

    return FileIOTaskConfig(download=downloads)


def _build_output_fileset_metadata(me: ModelEntity) -> dict | None:
    """Build tool_calling metadata to propagate to the output fileset.

    Extracts chat_template and tool_call_config from the source model entity's spec
    so the model-spec-runner will apply them to the output model entity.

    Returns:
        A dict like {"tool_calling": {...}} suitable for fileset metadata, or None
        if there is nothing to propagate.
    """
    if me.spec is None:
        return None

    tool_calling: dict = {}

    if me.spec.chat_template:
        tool_calling["chat_template"] = me.spec.chat_template

    if me.spec.tool_call_config:
        tcc = me.spec.tool_call_config
        if tcc.tool_call_parser:
            tool_calling["tool_call_parser"] = tcc.tool_call_parser
        if tcc.tool_call_plugin:
            tool_calling["tool_call_plugin"] = tcc.tool_call_plugin
        # Compared against None (not truthiness) so an explicit False is propagated.
        if tcc.auto_tool_choice is not None:
            tool_calling["auto_tool_choice"] = tcc.auto_tool_choice

    return {"tool_calling": tool_calling} if tool_calling else None


def _build_file_upload_config(
    output_fileset_name: str,
    fileset_metadata: dict | None = None,
) -> FileIOTaskConfig:
    """Build the configuration for the file_io upload task with a generated fileset name.

    The fileset name is generated at compile time and will be combined with
    the job's workspace at runtime to form the full FileSet reference.

    Args:
        output_fileset_name: The generated name for the output FileSet.
        fileset_metadata: Optional metadata to set on the output fileset (e.g., tool_calling
            config propagated from the source model entity).

    Returns:
        FileIOTaskConfig with upload items configured to use the generated name.
    """
    return FileIOTaskConfig(
        upload=[
            UploadItem(
                src=DEFAULT_OUTPUT_MODEL_PATH,
                # workspace is None because at this layer, we don't know the job's workspace.
                dest=FileSetRef(workspace=None, name=output_fileset_name),
                metadata=fileset_metadata,
            )
        ],
    )


def _build_model_entity_config(
    workspace: str, job_spec: CustomizationJobOutput, trust_remote_code: bool = False
) -> ModelEntityTaskConfig:
    """Build the configuration for the model_entity task.

    Args:
        workspace: The workspace for this job.
        job_spec: The customization job output specification (``output`` resolved).
        trust_remote_code: Whether to trust remote code for the checkpoint.

    Returns:
        ModelEntityTaskConfig with model entity creation settings.
    """
    base_model = _extract_model_name(job_spec)

    assert job_spec.output is not None, "output must be set by input-to-output transformer"
    training = job_spec.training

    # PEFT details are recorded only for LoRA jobs; otherwise the field stays None.
    peft_config: ModelEntityPEFTConfig | None = None
    if isinstance(training.peft, LoRAParams):
        peft_config = ModelEntityPEFTConfig(
            type=training.finetuning_type,
            alpha=training.peft.alpha,
            rank=training.peft.rank,
        )

    # Only forward the user-supplied deployment_config from the job spec.
    # tool_call_config from the *source* model entity's spec is propagated
    # separately via fileset metadata (see _build_output_fileset_metadata),
    # so we intentionally do not merge it here.
    deployment_config: str | ModelEntityDeploymentParameters | None = None
    if isinstance(job_spec.deployment_config, str):
        deployment_config = job_spec.deployment_config
    elif job_spec.deployment_config is not None:
        deployment_config = ModelEntityDeploymentParameters.model_validate(job_spec.deployment_config.model_dump())

    return ModelEntityTaskConfig(
        name=job_spec.output.name,
        workspace=workspace,
        description="Customized model from job",
        fileset=FileSetRef(
            workspace=None,
            name=job_spec.output.fileset,
        ),
        base_model=base_model,
        model_entity=job_spec.model,
        peft=peft_config,
        trust_remote_code=trust_remote_code,
        deployment_config=deployment_config,
    )


async def _resolve_deployment_config_ref(
    config_ref: str,
    workspace: str,
    sdk: AsyncNeMoPlatform,
):
    """Resolve a ``name`` or ``workspace/name`` string to a ModelDeploymentConfig.

    Args:
        config_ref: Reference string; ``workspace`` is used when it has no workspace part.
        workspace: Default workspace for the lookup.
        sdk: Platform SDK client used for the retrieval.

    Raises:
        PlatformJobCompilationError: If the config does not exist or the lookup
            fails for any other reason.
    """
    ref = parse_entity_ref(config_ref, default_workspace=workspace)
    try:
        return await sdk.inference.deployment_configs.retrieve(name=ref.name, workspace=ref.workspace)
    except NotFoundError as e:
        raise PlatformJobCompilationError(
            f"deployment_config references '{config_ref}' which does not exist in workspace '{ref.workspace}'."
        ) from e
    except Exception as e:
        raise PlatformJobCompilationError(f"Failed to resolve deployment_config '{config_ref}': {e}") from e


async def _validate_deployment_config(
    workspace: str,
    transformed_spec: CustomizationJobOutput,
    sdk: AsyncNeMoPlatform,
    auth_client: AuthClient,
) -> None:
    """Validate deployment_config consistency before training starts.

    Catches contradictory or impossible configurations early so the user
    gets a clear error instead of a silent failure after expensive training.

    Raises:
        PlatformJobCompilationError: On missing permissions for ``tool_call_plugin``,
            a LoRA job referencing a config with ``lora_enabled=false``, or a string
            reference that cannot target the model entity this job produces.
    """
    dc = transformed_spec.deployment_config
    if dc is None:
        return

    # Inline deployment params: check permission-gated fields.
    if isinstance(dc, DeploymentParams):
        tcc = dc.tool_call_config
        if tcc and tcc.tool_call_plugin:
            if not await auth_client.has_permissions(workspace, ["models.tool-call-plugin.set"]):
                raise PlatformJobCompilationError(
                    "Insufficient permissions to set tool_call_plugin. "
                    "Requires the models.tool-call-plugin.set permission."
                )
        return

    # String reference to an existing deployment config: validate consistency.
    if not isinstance(dc, str):
        return

    ft_type = transformed_spec.training.finetuning_type
    is_lora = ft_type == FinetuningType.LORA
    produces_new_model = ft_type in (FinetuningType.ALL_WEIGHTS, FinetuningType.LORA_MERGED)
    resolved_config = await _resolve_deployment_config_ref(dc, workspace, sdk)

    # LoRA job referencing a config that has lora_enabled=False
    if is_lora and resolved_config.nim_deployment and resolved_config.nim_deployment.lora_enabled is False:
        raise PlatformJobCompilationError(
            f"deployment_config references '{dc}' which has lora_enabled=false, "
            "but this is a LoRA training job. The deployment would not load LoRA adapters. "
            "Use a deployment config with lora_enabled=true, or provide inline deployment parameters."
        )

    # SFT or lora_merged referencing a string config
    if produces_new_model:
        output_name = transformed_spec.output.name
        try:
            existing_me = await sdk.models.retrieve(name=output_name, workspace=workspace)
        except NotFoundError as e:
            # Output model entity doesn't exist yet, so a string
            # ref is inherently invalid -- it was created for a different model.
            # Chained explicitly, matching the other handlers in this module.
            raise PlatformJobCompilationError(
                f"deployment_config cannot be a string reference ('{dc}') for {ft_type.value} training "
                "that creates a new model entity. The referenced config was created for a different model. "
                "Use inline deployment parameters (e.g., DeploymentParams(gpu=1, lora_enabled=True)) instead."
            ) from e

        # Output model entity already exists (retraining to create a new FileSet).
        # Verify the config actually targets this model entity.
        nim = resolved_config.nim_deployment
        config_targets_model = (resolved_config.model_entity_id == f"{existing_me.workspace}/{existing_me.name}") or (
            nim and nim.model_name == existing_me.name and nim.model_namespace == existing_me.workspace
        )
        if not config_targets_model:
            raise PlatformJobCompilationError(
                f"deployment_config references '{dc}' which targets a different model entity "
                f"than the output model '{existing_me.workspace}/{existing_me.name}'. "
                "The deployment config must target the same model entity being retrained, "
                "or use inline deployment parameters instead."
            )


def _build_cpu_task_step(
    name: str,
    task: str,
    task_config: dict,
    cpu_resources: ResourcesSpec,
    environment: list[EnvironmentVariable],
) -> PlatformJobStep:
    """Build one CPU-executed step that runs ``task`` in the tasks image."""
    return PlatformJobStep(
        name=name,
        executor=CPUExecutionProviderSpec(
            provider="cpu",
            container=ContainerSpec(
                image=get_tasks_image(),
                command=["nemo-platform", "run", "task", "--task", task],
            ),
            resources=cpu_resources,
        ),
        environment=environment,
        config=task_config,
    )


async def platform_job_config_compiler(
    workspace: str,
    job_spec: CustomizationJobOutput,
    sdk: AsyncNeMoPlatform,
    entity_client: EntityClient | None = None,
    job_name: str | None = None,
    original_spec: CustomizationJobInput | None = None,
    profile: str | None = None,
) -> PlatformJobSpec:
    """Compile canonical job spec into a four-step PlatformJobSpec.

    The steps are: download model/dataset, train, upload the customized model,
    create the model entity.

    Args:
        workspace: Workspace the job runs in.
        job_spec: Canonical (output-form) job specification.
        sdk: Platform SDK client used to fetch model entities and deployment configs.
        entity_client: Accepted for interface compatibility; unused.
        job_name: Accepted for interface compatibility; unused.
        original_spec: Accepted for interface compatibility; unused.
        profile: Accepted for interface compatibility; unused.

    Raises:
        PlatformJobCompilationError: If training validation fails, the teacher model
            cannot be fetched, the deployment config is invalid, or an embedding
            model is combined with unmerged LoRA.
    """
    del entity_client, job_name, original_spec, profile
    transformed_spec = job_spec
    logger.info("Compiling Automodel job to PlatformJobSpec: %s", transformed_spec.model_dump_json(indent=2))

    try:
        transformed_spec.validate_for_training()
    except ValidationError as e:
        raise PlatformJobCompilationError(str(e)) from e

    cpu_resources = _get_cpu_resources()
    base_env = _get_base_environment()

    # Fetch the primary model entity.
    # NOTE(review): unlike the teacher fetch below, errors raised here are not
    # wrapped in PlatformJobCompilationError -- confirm the caller handles them.
    me = await fetch_model_entity(transformed_spec.model, workspace, sdk)

    # For distillation jobs, also fetch the teacher model entity
    teacher_me: ModelEntity | None = None
    if isinstance(transformed_spec.training, DistillationTraining):
        try:
            teacher_me = await fetch_model_entity(transformed_spec.training.teacher_model, workspace, sdk)
        except ValueError as e:
            raise PlatformJobCompilationError(
                f"Teacher model '{transformed_spec.training.teacher_model}' not found. "
                "Verify the teacher model entity exists."
            ) from e
        except PermissionError as e:
            raise PlatformJobCompilationError(
                f"Access denied to teacher model '{transformed_spec.training.teacher_model}'."
            ) from e

    if transformed_spec.deployment_config is not None:
        auth_client = auth_client_context.get()
        if auth_client is None:
            raise PlatformJobCompilationError(
                "No auth context available; cannot validate deployment config permissions.",
            )
        await _validate_deployment_config(workspace, transformed_spec, sdk, auth_client)

    file_io_download_config = _build_file_download_config(transformed_spec, me, teacher_me)
    is_embedding_model_flag = _resolve_is_embedding_model(me)

    # The embedding NIM requires ONNX format, which cannot represent standalone LoRA adapters.
    # LoRA with merge=True (lora_merged) is allowed because it produces a full-weight model after training.
    if is_embedding_model_flag and transformed_spec.training.finetuning_type == FinetuningType.LORA:
        raise PlatformJobCompilationError(
            "NeMo Platform does not support unmerged LoRA for embedding models because the embedding NIM requires ONNX format, "
            "which cannot represent standalone adapters. "
            "Use peft with merge=True (lora_merged) or omit peft for all_weights training."
        )

    # Extract chat_template and tool_call_config from the source model entity's spec
    # (populated from fileset metadata by the model-spec-runner background task).
    # These are propagated to:
    # 1. The training step config (chat_template takes highest priority in template resolution)
    # 2. The output fileset metadata (so the model-spec-runner sets them on the output model)
    fileset_metadata = _build_output_fileset_metadata(me)
    # output is a required field in CustomizationJobOutput, so it is always set here.
    file_io_upload_config = _build_file_upload_config(transformed_spec.output.fileset, fileset_metadata)

    # Build model_entity config for creating the model entity
    trust_remote_code = me.trust_remote_code or False
    model_entity_config = _build_model_entity_config(workspace, transformed_spec, trust_remote_code)

    steps = [
        # Step 1: Download model and dataset files from Files service
        _build_cpu_task_step(
            "model-and-dataset-download",
            "nmp.automodel.tasks.file_io",
            file_io_download_config.model_dump(mode="json"),
            cpu_resources,
            base_env,
        ),
        # Step 2: Training job
        compile_training_step(
            transformed_spec,
            base_env,
            me,
            teacher_me=teacher_me,
        ),
        # Step 3: Upload customized model
        _build_cpu_task_step(
            "model-upload",
            "nmp.automodel.tasks.file_io",
            file_io_upload_config.model_dump(mode="json"),
            cpu_resources,
            base_env,
        ),
        # Step 4: Create model entity
        _build_cpu_task_step(
            "model-entity-creation",
            "nmp.automodel.tasks.model_entity",
            model_entity_config.model_dump(mode="json"),
            cpu_resources,
            base_env,
        ),
    ]

    return PlatformJobSpec(steps=steps)
b/services/automodel/src/nmp/automodel/app/jobs/context.py +new file mode 100644 +index 000000000..e518e8d65 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/app/jobs/context.py +@@ -0,0 +1,82 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++import os ++from dataclasses import dataclass ++from pathlib import Path ++from typing import Self ++ ++from nmp.common.entities.constants import DEFAULT_WORKSPACE ++from nmp.common.jobs.constants import ( ++ DEFAULT_NEMO_JOB_STEP_CONFIG_FILE_PATH, ++ NEMO_JOB_ATTEMPT_ID_ENVVAR, ++ NEMO_JOB_ID_ENVVAR, ++ NEMO_JOB_STEP_CONFIG_FILE_PATH_ENVVAR, ++ NEMO_JOB_STEP_ENVVAR, ++ NEMO_JOB_TASK_ENVVAR, ++ NEMO_JOB_WORKSPACE_ENVVAR, ++ PERSISTENT_JOB_STORAGE_PATH_ENVVAR, ++) ++from nmp.automodel.app.constants import ( ++ DEFAULT_JOB_STORAGE_PATH, ++ NMP_FILES_URL_ENVVAR, ++ NMP_JOBS_URL_ENVVAR, ++) ++ ++DEFAULT_JOB_ID = "unknown-job-id" ++DEFAULT_ATTEMPT_ID = "attempt-0" ++DEFAULT_STEP = "unknown-step" ++DEFAULT_TASK = "unknown-task" ++ ++ ++# Jobs task names should comply with NAME_PATTERN of EntityCreateInput.name for the Jobs API. ++# Generated tasks in k8s don't start with a lowercase letter per NAME_PATTERN, so we normalize ++# by adding the prefix when missing. 
++# In Docker environment core/jobs/src/nmp/core/jobs/controllers/backends/docker.py, ++# tasks are prefixed with `task-` by default: task_id = f"task-{uuid.uuid4().hex}" ++def _normalize_task_name(task: str) -> str: ++ """Ensure task name uses the expected Jobs prefix.""" ++ if task.startswith("task-"): ++ return task ++ return f"task-{task}" ++ ++ ++@dataclass(frozen=True) ++class NMPJobContext: ++ """NeMo Platform Job context populated from Job Controller environment variables""" ++ ++ workspace: str ++ job_id: str ++ attempt_id: str ++ step: str ++ task: str ++ ++ # Service URLs ++ jobs_url: str | None ++ files_url: str | None ++ ++ # Storage paths ++ storage_path: Path ++ config_path: Path ++ ++ @property ++ def normalized_task(self) -> str: ++ """Task normalized for Jobs API compatibility.""" ++ return _normalize_task_name(self.task) ++ ++ @classmethod ++ def from_env(cls) -> Self: ++ """Create a NMPJobContext from environment variables""" ++ return cls( ++ workspace=os.environ.get(NEMO_JOB_WORKSPACE_ENVVAR, DEFAULT_WORKSPACE), ++ job_id=os.environ.get(NEMO_JOB_ID_ENVVAR, DEFAULT_JOB_ID), ++ attempt_id=os.environ.get(NEMO_JOB_ATTEMPT_ID_ENVVAR, DEFAULT_ATTEMPT_ID), ++ step=os.environ.get(NEMO_JOB_STEP_ENVVAR, DEFAULT_STEP), ++ task=os.environ.get(NEMO_JOB_TASK_ENVVAR, DEFAULT_TASK), ++ jobs_url=os.environ.get(NMP_JOBS_URL_ENVVAR), ++ files_url=os.environ.get(NMP_FILES_URL_ENVVAR), ++ storage_path=Path(os.environ.get(PERSISTENT_JOB_STORAGE_PATH_ENVVAR, DEFAULT_JOB_STORAGE_PATH)), ++ config_path=Path( ++ os.environ.get(NEMO_JOB_STEP_CONFIG_FILE_PATH_ENVVAR, DEFAULT_NEMO_JOB_STEP_CONFIG_FILE_PATH) ++ ), ++ ) +diff --git a/services/automodel/src/nmp/automodel/app/jobs/file_io/schemas.py b/services/automodel/src/nmp/automodel/app/jobs/file_io/schemas.py +new file mode 100644 +index 000000000..c83da1ba5 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/app/jobs/file_io/schemas.py +@@ -0,0 +1,181 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA 
FILESET_PROTOCOL = "fileset://"


class TaskStatus(StrEnum):
    """Status of a file I/O task."""

    RUNNING = "running"
    COMPLETED = "completed"
    ERROR = "error"


class TaskPhase(StrEnum):
    """Phase of a file I/O task."""

    DOWNLOADING = "downloading"
    UPLOADING = "uploading"
    COMPLETED = "completed"


class FileSetRef(BaseModel):
    """Reference to a FileSet.

    Can be built from fields or validated directly from a string of the form
    ``workspace/name``, ``name``, or the legacy ``fileset://workspace/name``.
    """

    # workspace is optional because at compile time, the workspace is not known.
    # None tells the file_io task to use the job's workspace from the NMPJobContext.
    workspace: Optional[str] = None
    name: str

    def __str__(self) -> str:
        if self.workspace is None:
            return f"{self.name}"
        return f"{self.workspace}/{self.name}"

    def __repr__(self) -> str:
        return f"FileSetRef(workspace={self.workspace}, name={self.name})"

    @classmethod
    def _parse_string_parts(cls, ref: str) -> tuple[Optional[str], str] | None:
        """Parse a FileSet reference string into a tuple of workspace and name.

        The legacy ``fileset://`` prefix is stripped first. Only the first ``/``
        separates workspace from name, so the name may itself contain slashes.

        Returns:
            ``(workspace, name)`` with ``workspace`` None for a bare name, or None
            when the reference is empty or has an empty workspace or name segment
            (e.g. ``""``, ``"fileset://"``, ``"ws/"``, ``"/name"``).
        """
        if ref.startswith(FILESET_PROTOCOL):
            ref = ref[len(FILESET_PROTOCOL) :]
        workspace, separator, name = ref.partition("/")
        if not separator:
            # No slash: the whole string is the name (None if nothing is left).
            return (None, workspace) if workspace else None
        if not workspace or not name:
            return None
        return workspace, name

    @classmethod
    def extract_name(cls, ref: str) -> str:
        """Extract the fileset/entity name from a reference string.

        Supports:
        - workspace/name
        - name
        - fileset://workspace/name (legacy, stripped)
        """
        return cls.model_validate(ref).name

    @model_validator(mode="before")
    @classmethod
    def _convert_string_input(cls, v: object) -> object:
        """Convert a FileSet reference string into a dict of workspace and name.

        This makes it possible to create a FileSetRef from a string directly.
        Non-string input is returned unchanged for normal field validation.

        Raises:
            ValueError: If ``v`` is a string that cannot be parsed as a reference.
        """
        if isinstance(v, str):
            result = cls._parse_string_parts(v)
            if result is None:
                raise ValueError(f"Invalid FileSet reference: {v}. Expected format: workspace/name")
            workspace, name = result
            return {"workspace": workspace, "name": name}
        return v


class DownloadItem(BaseModel):
    """Configures a single download: fileset -> local path.

    Note: dest is an absolute path where files will be downloaded.
    This path should be under the job's shared storage (e.g., /var/run/scratch/job/model).
    """

    src: FileSetRef = Field(
        description="FileSet reference for the source files. "
        "Accepts 'workspace/name' or 'name' (job workspace used when omitted)."
    )
    dest: str = Field(
        default=".", description="Absolute destination path for downloaded files (e.g., '/var/run/scratch/job/model')."
    )


class UploadItem(BaseModel):
    """Configures a single upload: local path -> fileset."""

    src: str = Field(
        description="Absolute source path for files to upload (e.g., '/var/run/scratch/job/output_model')."
    )
    dest: FileSetRef = Field(
        description="FileSet reference for the destination. "
        "Accepts 'workspace/name' or 'name' (job workspace used when omitted)."
    )
    metadata: Optional[dict] = Field(
        default=None,
        description="Optional metadata to set on the created fileset (e.g., tool_calling config "
        "propagated from the source model entity).",
    )


class FileIOTaskConfig(BaseModel):
    """Configuration for the file_io task.

    Used when running: python -m nmp.automodel.tasks.file_io
    """

    download: list[DownloadItem] = Field(default_factory=list, description="List of FileSets to download.")
    upload: list[UploadItem] = Field(default_factory=list, description="List of files to upload to FileSets.")


class TaskCompilationError(Exception):
    """Error compiling a task configuration."""


class FileDownloadError(Exception):
    """Error downloading files from Files service."""


class FileUploadError(Exception):
    """Error uploading files to Files service."""


class ProgressReportError(Exception):
    """Error reporting progress to the Jobs service."""


class PathTraversalError(ValueError):
    """Error when a path attempts to escape the allowed base directory.

    This is a security error raised when user-provided paths like '../..' would
    result in file operations outside the designated storage directory.
    """


@dataclass
class FileStats:
    """Statistics for a file operation."""

    total_bytes: int = 0
    failed_files: int = 0


@dataclass
class DownloadStats(FileStats):
    """Statistics for a download operation."""

    files_downloaded: int = 0


@dataclass
class UploadStats(FileStats):
    """Statistics for an upload operation."""

    files_uploaded: int = 0
++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Model entity job configuration.""" ++ ++from .schemas import ModelEntityCreationError, ModelEntityTaskConfig ++ ++__all__ = [ ++ "ModelEntityCreationError", ++ "ModelEntityTaskConfig", ++] +diff --git a/services/automodel/src/nmp/automodel/app/jobs/model_entity/schemas.py b/services/automodel/src/nmp/automodel/app/jobs/model_entity/schemas.py +new file mode 100644 +index 000000000..b2cd122d2 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/app/jobs/model_entity/schemas.py +@@ -0,0 +1,106 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Schemas for the model_entity task configuration.""" ++ ++from typing import Optional ++ ++from nmp.automodel.app.jobs.file_io.schemas import FileSetRef ++from nmp.automodel.entities.values import FinetuningType ++from pydantic import BaseModel, Field ++ ++ ++class ToolCallConfig(BaseModel): ++ """Tool calling configuration for NIM deployments.""" ++ ++ tool_call_parser: Optional[str] = Field(default=None, description="Name of the tool call parser to use.") ++ tool_call_plugin: Optional[str] = Field( ++ default=None, ++ pattern=r"^[\w\-.]+/[\w\-.]+$", ++ description="Reference to a fileset containing the custom tool call plugin Python file. 
" ++ "Expected format: '{workspace}/{fileset_name}'.", ++ ) ++ auto_tool_choice: Optional[bool] = Field(default=None, description="Whether to enable automatic tool choice.") ++ ++ ++class DeploymentParameters(BaseModel): ++ """Inline deployment parameters for creating a new ModelDeploymentConfig.""" ++ ++ gpu: int = Field(default=1, description="Number of GPUs required for deployment") ++ additional_envs: Optional[dict[str, str]] = Field( ++ default=None, ++ description="Additional environment variables for deployment", ++ ) ++ disk_size: Optional[str] = Field(default=None, description="Disk size for deployment") ++ image_name: Optional[str] = Field( ++ default=None, ++ description="Container image name from NGC. Defaults to multi-llm when unset", ++ ) ++ image_tag: Optional[str] = Field(default=None, description="Container image tag from NGC") ++ lora_enabled: bool = Field( ++ default=True, ++ description=( ++ "When auto-deploying full SFT training, setting this true allows " ++ "subsequent LoRA adapters to be deployed against the model." ++ ), ++ ) ++ tool_call_config: Optional[ToolCallConfig] = Field( ++ default=None, ++ description="Tool calling configuration override for the NIM deployment.", ++ ) ++ ++ ++class PEFTConfig(BaseModel): ++ """PEFT configuration for LoRA and LoRA-merged fine-tuning.""" ++ ++ type: FinetuningType ++ rank: int ++ alpha: int ++ ++ ++class ModelEntityTaskConfig(BaseModel): ++ """Configuration for the model_entity task. 
++ ++ Used when running: python -m nmp.automodel.tasks.model_entity ++ """ ++ ++ name: str = Field( ++ description="Name of the model entity to create", ++ ) ++ workspace: str = Field( ++ description="Workspace of the model entity to create", ++ ) ++ description: Optional[str] = Field( ++ default=None, ++ description="Optional description of the model", ++ ) ++ fileset: FileSetRef = Field( ++ description="FileSet reference containing the customized model artifacts", ++ ) ++ model_entity: str = Field(..., description="The model entity this model was based on.") ++ base_model: Optional[str] = Field( ++ default=None, ++ description="Link to the base model used for customization", ++ ) ++ peft: Optional[PEFTConfig] = Field( ++ default=None, ++ description="PEFT configuration. Set for LoRA/LoRA-merged, None for full SFT.", ++ ) ++ ++ trust_remote_code: bool = Field( ++ default=False, ++ description="Whether to trust remote code for the checkpoint, propagated from the source model entity.", ++ ) ++ ++ deployment_config: Optional[str | DeploymentParameters] = Field( ++ default=None, ++ description="Deployment configuration. A string references an existing ModelDeploymentConfig " ++ "by name. An object provides inline NIM deployment parameters. " ++ "Omit to skip deployment.", ++ ) ++ ++ ++class ModelEntityCreationError(Exception): ++ """Error creating model entity.""" ++ ++ pass +diff --git a/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py b/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py +new file mode 100644 +index 000000000..8d1b7e3b6 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py +@@ -0,0 +1,398 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Training step compiler.""" ++ ++import logging ++ ++from nemo_platform.types.models.model_entity import ModelEntity ++from nmp.automodel.api.v2.jobs.schemas import ( ++ AnyTraining, ++ CustomizationJobOutput, ++ DistillationTraining, ++ LoRAParams, ++ MLflowParams, ++ WandBParams, ++) ++from nmp.automodel.app.constants import ( ++ DEFAULT_DATASET_PATH, ++ DEFAULT_MODEL_PATH, ++ DEFAULT_TEACHER_MODEL_PATH, ++ V4_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, ++) ++from nmp.automodel.app.jobs.training.schemas import ( ++ DistillationConfig, ++ LoRAConfig, ++ MLflowConfig, ++ ModelConfig, ++ TrainingStepConfig, ++ WandBConfig, ++) ++from nmp.automodel.config import config ++from nmp.automodel.entities.values import Precision, TrainingType ++from nmp.common.jobs.api_factory import ( ++ ContainerSpec, ++ DistributedGPUExecutionProviderSpec, ++ EnvironmentVariable, ++ EnvironmentVariableFromSecret, ++ GPUExecutionProviderSpec, ++ PlatformJobStep, ++ ResourcesSpec, ++ StepLifecycle, ++) ++from nmp.automodel.images import get_training_image ++from nmp.common.model_utils import is_embedding_model ++ ++logger = logging.getLogger(__name__) ++ ++ ++def _resolve_is_embedding_model(me: ModelEntity) -> bool: ++ """Resolve embedding flag while preserving compatibility with legacy specs.""" ++ if me.spec is None: ++ return is_embedding_model(me.name) ++ ++ # Do not rely on `me.spec is not None` alone: ++ # older persisted ModelSpec payloads may not include `is_embedding_model`. ++ # Pydantic fills missing fields with the default (False), which would ++ # incorrectly classify legacy embedding models as LLMs. 
++ model_fields_set = getattr(me.spec, "model_fields_set", getattr(me.spec, "__fields_set__", set())) ++ if "is_embedding_model" not in model_fields_set: ++ return is_embedding_model(me.name) ++ ++ return me.spec.is_embedding_model or False ++ ++ ++def _resolve_v4_compatible(me: ModelEntity) -> bool: ++ """Check if the model requires transformers-v4-compatible checkpoint output.""" ++ if me.spec is None: ++ return False ++ checkpoint_model_name = getattr(me.spec, "checkpoint_model_name", None) ++ is_v4_compatible = checkpoint_model_name in V4_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES ++ logger.info(f"Checkpoint model name {checkpoint_model_name} is v4 compatible: {is_v4_compatible}") ++ return is_v4_compatible ++ ++ ++def _resolve_custom_implementation_override(me: ModelEntity) -> bool: ++ if me.spec is None: ++ return False ++ ++ checkpoint_model_name = getattr(me.spec, "checkpoint_model_name", None) ++ if checkpoint_model_name == "NemotronHForCausalLM" and getattr(me.spec, "moe_config", None) is None: ++ # V2 Model is being used, v3 uses MoE - However V2 gets recognized as V3 and fails ++ return True ++ ++ if ( ++ checkpoint_model_name == "MistralForCausalLM" ++ and getattr(me.spec, "family", None) == "mistral" ++ and getattr(me.spec, "is_chat", False) ++ ): ++ # Mistral 7b v0.3 Instruct has the custom tokenizer implementation fail with: ++ """2026-03-02 18:35:51 | INFO | root | Using model config to instantiate tokenizer ++ 2026-03-02 18:35:53 | INFO | nemo_automodel._transformers.auto_tokenizer | Using custom tokenizer MistralCommonBackend for model type 'mistral' ++ 2026-03-02 18:35:53 | WARNING | nemo_automodel._transformers.tokenization.tokenization_mistral_common | Multiple tokenizer files found in directory: /var/run/scratch/job/model. Using tokenizer.model.v3. 
++ Instantiation failed for `ColumnMappedTextInstructionDataset` ++ Accepted signature : (path_or_dataset_id: Union[str, List[str]], column_mapping: Dict[str, str], tokenizer, *, split: Optional[str] = 'train', name: Optional[str] = None, answer_only_loss_mask: bool = True, seq_length: Optional[int] = None, padding: Union[str, bool] = 'do_not_pad', truncation: Union[str, bool] = 'do_not_truncate', limit_dataset_samples: Optional[int] = None, use_hf_chat_template: bool = False) -> None ++ Positional args : () ++ Keyword args : { 'answer_only_loss_mask': True, ++ 'column_mapping': {'answer': 'completion', 'question': 'prompt'}, ++ 'padding': 'do_not_pad', ++ 'path_or_dataset_id': '/run/scratch/job/training/dataset/train.jsonl', ++ 'seq_length': 1024, ++ 'split': 'train', ++ 'tokenizer': '******', ++ 'truncation': 'longest_first'} ++ Exception : piece id is out of range. ++ """ ++ return True ++ ++ return False ++ ++ ++def compile_training_step( ++ job_spec: CustomizationJobOutput, ++ base_env: list[EnvironmentVariable], ++ me: ModelEntity, ++ teacher_me: ModelEntity | None = None, ++) -> PlatformJobStep: ++ """Compile job input to a PlatformJobStep for training. ++ ++ Args: ++ job_spec: The customization job output specification. ++ base_env: Base environment variables for the job step. ++ me: The model entity being trained. ++ teacher_me: Optional teacher model entity for knowledge distillation jobs. 
++ ++ """ ++ job_spec.validate_for_training() ++ if TrainingType(job_spec.training.type) == TrainingType.DPO: ++ raise ValueError("DPO training is not supported by nmp-automodel") ++ trust_remote_code = me.trust_remote_code or False ++ chat_template = me.spec.chat_template if me.spec else None ++ is_embedding_model = _resolve_is_embedding_model(me) ++ override_custom_impl = _resolve_custom_implementation_override(me) ++ v4_compatible = _resolve_v4_compatible(me) ++ training = job_spec.training ++ p = training.parallelism ++ num_gpus_per_node = p.num_gpus_per_node ++ ++ training_config = TrainingStepConfig( ++ model=_translate_model_config( ++ job_spec, ++ DEFAULT_MODEL_PATH, ++ trust_remote_code=trust_remote_code, ++ is_embedding_model=is_embedding_model, ++ chat_template=chat_template, ++ override_custom_impl=override_custom_impl, ++ v4_compatible=v4_compatible, ++ ), ++ dataset=TrainingStepConfig.DatasetConfig( ++ path=DEFAULT_DATASET_PATH, ++ ), ++ training=_translate_training_config(training, me, teacher_me=teacher_me), ++ schedule=TrainingStepConfig.ScheduleConfig( ++ epochs=training.epochs, ++ max_steps=training.max_steps, ++ val_check_interval=training.val_check_interval, ++ ), ++ batch=TrainingStepConfig.BatchConfig( ++ global_batch_size=training.batch_size, ++ micro_batch_size=training.micro_batch_size, ++ sequence_packing=training.sequence_packing, ++ ), ++ optimizer=TrainingStepConfig.OptimizerConfig( ++ learning_rate=training.learning_rate, ++ min_learning_rate=training.min_learning_rate, ++ weight_decay=training.weight_decay, ++ beta1=training.adam_beta1, ++ beta2=training.adam_beta2, ++ warmup_steps=training.warmup_steps, ++ ), ++ parallelism=TrainingStepConfig.ParallelismConfig( ++ num_nodes=p.num_nodes, ++ num_gpus_per_node=num_gpus_per_node, ++ tensor_parallel_size=p.tensor_parallel_size, ++ pipeline_parallel_size=p.pipeline_parallel_size, ++ context_parallel_size=p.context_parallel_size, ++ expert_parallel_size=p.expert_parallel_size, ++ 
sequence_parallel=p.sequence_parallel, ++ ), ++ integrations=_translate_integrations(job_spec), ++ output_model=job_spec.output.name, ++ ) ++ ++ container = ContainerSpec( ++ image=_get_training_image(), ++ command=["python", "-m", "nmp.automodel.tasks.training"], ++ ) ++ ++ profile = ( ++ training.execution_profile ++ if training.execution_profile is not None ++ else config.default_training_execution_profile ++ ) ++ ++ if p.num_nodes > 1: ++ logger.debug(f"Using distributed GPU executor: num_nodes={p.num_nodes}, num_gpus_per_node={num_gpus_per_node}") ++ executor = DistributedGPUExecutionProviderSpec( ++ provider="gpu_distributed", ++ profile=profile, ++ container=container, ++ resources=ResourcesSpec( ++ num_gpus=num_gpus_per_node, ++ num_nodes=p.num_nodes, ++ ), ++ ) ++ else: ++ logger.debug(f"Using single-node GPU executor: num_gpus={num_gpus_per_node}") ++ executor = GPUExecutionProviderSpec( ++ provider="gpu", ++ profile=profile, ++ container=container, ++ resources=ResourcesSpec( ++ num_gpus=num_gpus_per_node, ++ ), ++ ) ++ ++ secret_envs = _collect_integration_secret_envs(job_spec) ++ ++ return PlatformJobStep( ++ name="customization-training-job", ++ executor=executor, ++ environment=[*base_env, *secret_envs, EnvironmentVariable(name="HF_DATASETS_OFFLINE", value="1")], ++ config=training_config.model_dump(mode="json"), ++ lifecycle=StepLifecycle(staleness_timeout_seconds=config.training_staleness_timeout_seconds), ++ ) ++ ++ ++def _translate_model_config( ++ job_spec: CustomizationJobOutput, ++ path: str, ++ trust_remote_code: bool = False, ++ is_embedding_model: bool = False, ++ chat_template: str | None = None, ++ override_custom_impl: bool = False, ++ v4_compatible: bool = False, ++) -> ModelConfig: ++ """Translate job spec to internal ModelConfig.""" ++ training = job_spec.training ++ return ModelConfig( ++ path=path, ++ name=_extract_model_name(job_spec), ++ max_seq_length=training.max_seq_length, ++ precision=training.precision, ++ 
trust_remote_code=trust_remote_code, ++ is_embedding_model=is_embedding_model, ++ chat_template=chat_template, ++ override_custom_impl=override_custom_impl, ++ v4_compatible=v4_compatible, ++ ) ++ ++ ++def _translate_training_config( ++ training: AnyTraining, ++ me: ModelEntity, ++ teacher_me: ModelEntity | None = None, ++) -> TrainingStepConfig.TrainingConfig: ++ """Translate API training method to internal TrainingConfig. ++ ++ Args: ++ training: The API training configuration. ++ me: The primary model entity. ++ teacher_me: Teacher model entity, populated for distillation jobs. ++ """ ++ training_type = TrainingType(training.type) ++ lora = _translate_lora_config(training.peft, me) if isinstance(training.peft, LoRAParams) else None ++ ++ kd = None ++ if isinstance(training, DistillationTraining): ++ teacher_trust_remote_code = (teacher_me.trust_remote_code or False) if teacher_me else False ++ kd = DistillationConfig( ++ teacher_model=ModelConfig( ++ path=DEFAULT_TEACHER_MODEL_PATH, ++ name=training.teacher_model, ++ precision=Precision(training.teacher_precision), ++ trust_remote_code=teacher_trust_remote_code, ++ ), ++ ratio=training.distillation_ratio, ++ temperature=training.distillation_temperature, ++ ) ++ ++ return TrainingStepConfig.TrainingConfig( ++ training_type=training_type, ++ finetuning_type=training.finetuning_type, ++ lora=lora, ++ kd=kd, ++ ) ++ ++ ++def _translate_lora_config(api_lora: LoRAParams, me: ModelEntity) -> LoRAConfig: ++ """Translate API LoRAConfig to internal LoRAConfig.""" ++ lora = LoRAConfig( ++ rank=api_lora.rank, ++ alpha=api_lora.alpha, ++ dropout=api_lora.dropout, ++ target_modules=api_lora.target_modules, ++ use_triton=True, ++ ) ++ ++ if not lora.target_modules: ++ if me.spec and me.spec.checkpoint_model_name == "NemotronHForCausalLM": ++ # Need to remove out_proj from the list of target modules ++ modules = set() ++ if me.spec.linear_layers: ++ for ll in me.spec.linear_layers: ++ m = ll.name.split(".")[-1] ++ if 
m.endswith("proj"): ++ modules.add(f"*.{m}") ++ modules.discard("*.out_proj") ++ ++ # In cases when model_spec has linear_layers as null, we need to set the target_modules to default ++ # If target_modules is empty we get this error during training: ++ # Expected match_all_linear to be true or target_modules/exclude_modules to be non-empty ++ lora.target_modules = list(modules) if modules else ["*proj"] ++ else: ++ lora.target_modules = ["*proj"] ++ return lora ++ ++ ++def _translate_wandb_config(api_wandb: WandBParams | None) -> WandBConfig | None: ++ """Translate API WandBParams to internal WandBConfig.""" ++ if api_wandb is None: ++ return None ++ ++ return WandBConfig( ++ project=api_wandb.project, ++ name=api_wandb.name, ++ entity=api_wandb.entity, ++ tags=api_wandb.tags, ++ notes=api_wandb.notes, ++ base_url=api_wandb.base_url, ++ ) ++ ++ ++def _translate_mlflow_config(api_mlflow: MLflowParams | None) -> MLflowConfig | None: ++ """Translate API MLflowParams to internal MLflowConfig.""" ++ if api_mlflow is None: ++ return None ++ ++ return MLflowConfig( ++ experiment_name=api_mlflow.experiment_name, ++ run_name=api_mlflow.run_name, ++ tags=api_mlflow.tags, ++ description=api_mlflow.description, ++ tracking_uri=api_mlflow.tracking_uri, ++ ) ++ ++ ++def _translate_integrations(job_spec: CustomizationJobOutput) -> TrainingStepConfig.IntegrationsConfig: ++ """Translate API IntegrationsConfig to internal IntegrationsConfig.""" ++ if not job_spec.integrations: ++ return TrainingStepConfig.IntegrationsConfig() ++ ++ return TrainingStepConfig.IntegrationsConfig( ++ wandb=_translate_wandb_config(job_spec.integrations.wandb), ++ mlflow=_translate_mlflow_config(job_spec.integrations.mlflow), ++ ) ++ ++ ++def _collect_integration_secret_envs(job_input: CustomizationJobOutput) -> list[EnvironmentVariable]: ++ """Collect secret environment variables from integration configs. 
++ ++ Secrets are propagated via PlatformJobStep.environment (not config) so that ++ the Jobs service can resolve secret references at runtime. ++ """ ++ secret_envs: list[EnvironmentVariable] = [] ++ if not job_input.integrations: ++ return secret_envs ++ ++ if job_input.integrations.wandb and job_input.integrations.wandb.api_key_secret: ++ secret_envs.append( ++ EnvironmentVariable( ++ name="WANDB_API_KEY", ++ from_secret=EnvironmentVariableFromSecret( ++ name=job_input.integrations.wandb.api_key_secret.root, ++ ), ++ ) ++ ) ++ ++ return secret_envs ++ ++ ++def _extract_model_name(job_spec: CustomizationJobOutput) -> str | None: ++ """Extract the canonical model name from the model field for template lookup. ++ ++ The model name follows the pattern "workspace/name" (e.g., "meta/llama-3.1-8b-instruct") ++ which matches the keys in DEFAULT_CHAT_TEMPLATES. ++ """ ++ model = job_spec.model ++ ++ if "/" in model: ++ logger.debug(f"Extracted model name from URN: {model}") ++ return model ++ ++ return None ++ ++ ++def _get_training_image() -> str: ++ """Training container image for the Automodel task.""" ++ return config.training_automodel_image or get_training_image() +diff --git a/services/automodel/src/nmp/automodel/app/jobs/training/schemas.py b/services/automodel/src/nmp/automodel/app/jobs/training/schemas.py +new file mode 100644 +index 000000000..93d8ae7f4 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/app/jobs/training/schemas.py +@@ -0,0 +1,293 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
++# SPDX-License-Identifier: Apache-2.0 ++ ++from enum import Enum ++from typing import Optional ++ ++from nmp.automodel.app.constants import ( ++ DEFAULT_OUTPUT_MODEL_PATH, ++ DEFAULT_SEED, ++ DEFAULT_TRAINING_OUTPUT_PATH, ++) ++from nmp.automodel.entities.values import CheckpointFormat, FinetuningType, Precision, TrainingType ++from pydantic import BaseModel, Field ++ ++ ++class OptimizerType(str, Enum): ++ """Optimizer and scheduler combination types.""" ++ ++ ADAMW_WITH_COSINE_ANNEALING = "adamw_with_cosine_annealing" ++ ADAM_WITH_COSINE_ANNEALING = "adam_with_cosine_annealing" ++ ADAMW_WITH_FLAT_LR = "adamw_with_flat_lr" ++ ADAM_WITH_FLAT_LR = "adam_with_flat_lr" ++ ++ ++class LoRAConfig(BaseModel): ++ """Internal LoRA configuration with implementation details. ++ ++ This differs from the API LoRAParams: ++ - Includes use_triton, match_all_linear (implementation details) ++ - exclude_modules for advanced control ++ - Can add new fields freely without breaking API ++ """ ++ ++ # Core LoRA parameters (from API) ++ rank: int = Field(default=8, description="LoRA rank (low-rank dimension)") ++ alpha: int = Field(default=32, description="LoRA alpha scaling factor") ++ dropout: float = Field(default=0.0, description="LoRA dropout probability") ++ ++ # Module targeting ++ target_modules: Optional[list[str]] = Field( ++ default=None, description="Module name patterns to apply LoRA to (e.g., ['*.proj'])" ++ ) ++ exclude_modules: Optional[list[str]] = Field(default=None, description="Module name patterns to exclude from LoRA") ++ ++ # Implementation details (not in API) ++ use_triton: bool = Field(default=True, description="Use optimized Triton LoRA kernel") ++ ++ ++class ModelConfig(BaseModel): ++ """Internal model configuration.""" ++ ++ path: str = Field(description="Path to a model directory (contains config, weights, tokenizer etc.)") ++ name: Optional[str] = Field( ++ default=None, ++ description="Model identifier (e.g., 'meta/llama-3.1-8b-instruct')", ++ ) ++ 
max_seq_length: int = Field( ++ default=2048, ++ description="Maximum token sequence length for training; longer sequences are truncated", ++ ) ++ ++ # Model loading options ++ precision: Optional[Precision] = Field( ++ default=None, ++ description="Model weight dtype (e.g., 'bf16', 'fp16'). None implies auto-detects from model config", ++ ) ++ attn_implementation: Optional[str] = Field( ++ default="sdpa", ++ description="Attention backend: 'sdpa' (PyTorch native), 'flash_attention_2' (requires flash-attn), 'eager' (no optimization)", ++ ) ++ trust_remote_code: bool = Field( ++ default=False, ++ description="Allow executing custom model code from the checkpoint. Required for some community models", ++ ) ++ is_embedding_model: bool = Field( ++ default=False, ++ description="Whether the model is an embedding model", ++ ) ++ chat_template: Optional[str] = Field( ++ default=None, ++ description="Jinja2 chat template from the model entity spec or fileset metadata. " ++ "Takes highest priority in resolve_chat_template when set.", ++ ) ++ ++ override_custom_impl: bool = Field( ++ default=False, ++ description="Some of the custom implementations in nemo automodel cause loading failures when used with other models in the same family, this forces the use_hf=True flag to use non custom implementations.", ++ ) ++ ++ v4_compatible: bool = Field( ++ default=False, ++ description="Enable transformers-v4-compatible checkpoint output that preserves the original transformers-v4-style config.json output.", ++ ) ++ ++ ++class DistillationConfig(BaseModel): ++ """Internal Knowledge Distillation configuration. ++ ++ teacher is a ModelConfig with resolved path, not a URN. 
++ """ ++ ++ # Teacher model (resolved path) ++ teacher_model: ModelConfig = Field(description="Teacher model configuration with resolved path") ++ ++ # KD hyperparameters ++ ratio: float = Field(default=0.5, description="Balance between CE loss and KD loss") ++ temperature: float = Field(default=1.0, description="Softmax temperature for KD") ++ ++ # Implementation detail (not in API) ++ offload_teacher: bool = Field(default=False, description="Offload teacher model to CPU for memory efficiency") ++ ++ ++class EmbeddingConfig(BaseModel): ++ """Internal Embedding/Biencoder model finetuning configuration. ++ ++ This is used internally when a model is detected as an embedding model ++ by its name. The defaults here match the recommended settings for ++ NeMo embedding models. ++ ++ Note: Embedding models are detected by model name (e.g., contains 'embed'), ++ not by a separate training type. They use standard SFT training type. ++ ++ Model architecture parameters (share_encoder, pooling, l2_normalize, temperature, ++ add_linear_pooler, out_dimension) use sensible defaults and are not exposed here. ++ """ ++ ++ # Training configuration ++ train_n_passages: int = Field( ++ default=5, ++ description=( ++ "Total number of passages per query during training: 1 positive + (n-1) negatives. " ++ "For example, train_n_passages=5 means 1 positive and 4 negative passages per query." ++ ), ++ ) ++ eval_negative_size: Optional[int] = Field( ++ default=None, ++ description=( ++ "Number of negative passages per query during validation. " ++ "Recommended to keep as train_n_passages - 1 for consistent train/eval behavior. " ++ "If not set, defaults to train_n_passages - 1." ++ ), ++ ) ++ ++ # Memory optimization ++ do_gradient_checkpointing: bool = Field( ++ default=False, ++ description=( ++ "Enable gradient checkpointing to reduce memory usage at the cost of slower training. " ++ "Useful for larger embedding models or memory-constrained environments." 
++ ), ++ ) ++ ++ # Tokenization configuration ++ query_max_length: int = Field(default=512, description="Maximum token length for query tokenization") ++ passage_max_length: int = Field(default=512, description="Maximum token length for passage tokenization") ++ query_prefix: str = Field(default="query:", description="Prefix to prepend to queries before tokenization") ++ passage_prefix: str = Field(default="passage:", description="Prefix to prepend to passages before tokenization") ++ ++ ++class WandBConfig(BaseModel): ++ """Internal Weights & Biases configuration.""" ++ ++ project: Optional[str] = Field(default=None, description="W&B project name") ++ name: Optional[str] = Field(default=None, description="W&B run name") ++ entity: Optional[str] = Field(default=None, description="W&B entity") ++ tags: Optional[list[str]] = Field(default=None, description="W&B tags") ++ notes: Optional[str] = Field(default=None, description="W&B notes") ++ base_url: Optional[str] = Field(default=None, description="Self-hosted W&B server URL") ++ ++ ++class MLflowConfig(BaseModel): ++ """Internal MLflow configuration.""" ++ ++ experiment_name: Optional[str] = Field(default=None, description="MLflow experiment name") ++ run_name: Optional[str] = Field(default=None, description="MLflow run name") ++ tags: Optional[dict[str, str]] = Field(default=None, description="MLflow tags") ++ description: Optional[str] = Field(default=None, description="MLflow description") ++ tracking_uri: Optional[str] = Field(default=None, description="MLflow tracking URI") ++ ++ ++class TrainingStepConfig(BaseModel): ++ """Normalized training configuration compiled into nemo-automodel recipe YAML.""" ++ ++ class DatasetConfig(BaseModel): ++ path: str ++ prompt_template: Optional[str] = None ++ add_bos: Optional[bool] = None ++ add_eos: Optional[bool] = None ++ ++ class TrainingConfig(BaseModel): ++ training_type: TrainingType ++ finetuning_type: Optional[FinetuningType] = None ++ lora: Optional[LoRAConfig] = 
None ++ kd: Optional[DistillationConfig] = None ++ ++ class ScheduleConfig(BaseModel): ++ epochs: int = 1 ++ max_steps: Optional[int] = None ++ val_check_interval: Optional[float] = None ++ ++ class BatchConfig(BaseModel): ++ global_batch_size: int = Field(default=32, gt=0) ++ micro_batch_size: int = Field(default=1, gt=0) ++ sequence_packing: bool = False ++ sequence_packing_max_samples: int = 1000 ++ ++ class OptimizerConfig(BaseModel): ++ optimizer_type: Optional[OptimizerType] = Field(default=None) ++ learning_rate: float = 1e-4 ++ min_learning_rate: Optional[float] = None ++ eps: float = 1e-8 ++ weight_decay: float = 0.01 ++ beta1: float = 0.9 ++ beta2: float = 0.999 ++ warmup_steps: int = 0 ++ ++ class ParallelismConfig(BaseModel): ++ num_nodes: int = 1 ++ num_gpus_per_node: int = 1 ++ tensor_parallel_size: int = 1 ++ pipeline_parallel_size: int = 1 ++ context_parallel_size: int = 1 ++ expert_parallel_size: Optional[int] = None ++ sequence_parallel: bool = False ++ ++ class IntegrationsConfig(BaseModel): ++ wandb: Optional[WandBConfig] = None ++ mlflow: Optional[MLflowConfig] = None ++ ++ # === Main Config Fields === ++ model: ModelConfig ++ dataset: DatasetConfig ++ training: TrainingConfig ++ schedule: ScheduleConfig ++ batch: BatchConfig ++ optimizer: OptimizerConfig ++ parallelism: ParallelismConfig ++ integrations: IntegrationsConfig = Field(default_factory=IntegrationsConfig) ++ ++ # === Output Paths === ++ output_model: str # Set at compile-time from CustomizationJobOutput ++ workspace_path: str = Field(default=DEFAULT_TRAINING_OUTPUT_PATH) ++ output_path: str = Field(default=DEFAULT_OUTPUT_MODEL_PATH) ++ ++ # === Miscellaneous === ++ seed: int = Field( ++ default=DEFAULT_SEED, description="Random seed for ensuring reproducibility in all random processes." 
++ ) ++ training_timeout: Optional[int] = None ++ ++ ++class GPUInfo(BaseModel): ++ """GPU architecture information captured during training.""" ++ ++ architecture: str ++ device_name: str ++ memory_gb: float ++ cuda_version: str ++ ++ ++class CheckpointInfo(BaseModel): ++ """Output checkpoint information.""" ++ ++ path: str ++ format: CheckpointFormat ++ precision: Optional[Precision] = Field( ++ default=None, description="Checkpoint precision. None when auto-detected from model config." ++ ) ++ ++ ++class TrainingMetrics(BaseModel): ++ """Final training metrics.""" ++ ++ final_loss: Optional[float] = None ++ final_val_loss: Optional[float] = None ++ best_val_loss: Optional[float] = None ++ total_steps: int = 0 ++ total_epochs: int = 0 ++ ++ ++class TrainingResult(BaseModel): ++ """ ++ Result written by training task. ++ ++ Written to: {workspace_path}/training_result.json ++ """ ++ ++ success: bool ++ error_message: Optional[str] = None ++ checkpoint: Optional[CheckpointInfo] = None ++ gpu_info: Optional[GPUInfo] = None ++ metrics: TrainingMetrics = Field(default_factory=TrainingMetrics) ++ training_duration_seconds: Optional[float] = None +diff --git a/services/automodel/src/nmp/automodel/compile.py b/services/automodel/src/nmp/automodel/compile.py +new file mode 100644 +index 000000000..bf6bafe3f +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/compile.py +@@ -0,0 +1,36 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
++# SPDX-License-Identifier: Apache-2.0
++
++"""Public compile entrypoint for Automodel jobs."""
++
++from __future__ import annotations
++
++from nmp.automodel.adapter import automodel_spec_to_compiler_output
++from nmp.automodel.api.v2.jobs.schemas import CustomizationJobOutput
++from nmp.automodel.app.jobs.compiler import platform_job_config_compiler as _compile_canonical
++
++
++async def platform_job_config_compiler(
++    job_spec: CustomizationJobOutput | object,
++    workspace: str,
++    sdk: object,
++    job_name: str | None = None,
++    profile: str | None = None,
++) -> object:
++    """Turn an Automodel job spec (plugin or legacy shape) into a PlatformJobSpec."""
++    # Anything that is not already the canonical schema goes through the adapter first.
++    spec = job_spec if isinstance(job_spec, CustomizationJobOutput) else automodel_spec_to_compiler_output(job_spec)
++    # An explicit profile only fills a gap: a profile already set on the spec is kept.
++    if profile and spec.training.execution_profile is None:
++        training = spec.training.model_copy(update={"execution_profile": profile})
++        spec = spec.model_copy(update={"training": training})
++    return await _compile_canonical(
++        workspace,
++        spec,
++        sdk,  # type: ignore[arg-type]
++        job_name=job_name,
++        profile=profile,
++    )
++
++
++__all__ = ["platform_job_config_compiler", "automodel_spec_to_compiler_output"]
+diff --git a/services/automodel/src/nmp/automodel/config.py b/services/automodel/src/nmp/automodel/config.py
+new file mode 100644
+index 000000000..c6cd47957
+--- /dev/null
++++ b/services/automodel/src/nmp/automodel/config.py
+@@ -0,0 +1,49 @@
++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
++# SPDX-License-Identifier: Apache-2.0
++
++"""Configuration for the nmp-automodel compiler and tasks."""
++
++from nmp.common.config import create_service_config_class, get_platform_config, get_service_config
++from pydantic import Field
++
++
++class AutomodelConfig(create_service_config_class("automodel")):  # type: ignore
++    """Environment variables use the NMP_AUTOMODEL_ prefix."""
++
++    image_registry: str = Field(
++        default="nvcr.io/0921617854601259/nemo-platform-dev",
++        description=(
++            "Registry host/path prefix for nmp/automodel-tasks and nmp/automodel-training. "
++            "Override via NMP_AUTOMODEL_IMAGE_REGISTRY for other environments."
++        ),
++    )
++    training_image: str | None = Field(
++        default=None,
++        description="Override GPU training image (default: nmp/automodel-training under image_registry).",
++    )
++    tasks_image: str | None = Field(
++        default=None,
++        description="Override CPU tasks image (default: nmp/automodel-tasks under image_registry).",
++    )
++
++    default_job_resource_cpu_request: str = Field(default="1")
++    default_job_resource_memory_request: str = Field(default="8Gi")
++    default_job_resource_cpu_limit: str = Field(default="4")
++    default_job_resource_memory_limit: str = Field(default="16Gi")
++
++    training_staleness_timeout_seconds: int = Field(
++        default=3600,
++        description="Terminate training if no task progress within this many seconds (0 disables).",
++    )
++
++    default_training_execution_profile: str = Field(
++        default="gpu",
++        description="Default GPU execution profile when the job spec omits training.execution_profile.",
++    )
++
++
++config = get_service_config(AutomodelConfig)
++platform_config = get_platform_config()
++# Legacy compiler attribute name: training_image is copied once, at import, to training_automodel_image.
++# NOTE(review): assumes the config object accepts attributes that are not declared fields - confirm.
++config.training_automodel_image = config.training_image
+diff --git a/services/automodel/src/nmp/automodel/entities/__init__.py b/services/automodel/src/nmp/automodel/entities/__init__.py
+new file mode 100644
+index 000000000..13e49b937
+--- /dev/null
++++ 
b/services/automodel/src/nmp/automodel/entities/__init__.py
+@@ -0,0 +1,29 @@
++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
++# SPDX-License-Identifier: Apache-2.0
++
++"""Customizer entity definitions.
++
++This module exports:
++- Entity classes (database/persistence models)
++- Shared value types (enums and read-only metadata)
++
++Configuration types (LoRAConfig, ModelConfig, etc.) are NOT exported here.
++They belong in their respective layers:
++- API types → api/v2/jobs/schemas.py
++- Internal types → app/jobs/training/schemas.py
++"""
++
++from .values import (
++    CheckpointFormat,
++    FinetuningType,
++    Precision,
++    TrainingType,
++)
++
++__all__ = [
++    # Enums
++    "CheckpointFormat",
++    "FinetuningType",
++    "Precision",
++    "TrainingType",
++]
+diff --git a/services/automodel/src/nmp/automodel/entities/validators.py b/services/automodel/src/nmp/automodel/entities/validators.py
+new file mode 100644
+index 000000000..d92d9c3ad
+--- /dev/null
++++ b/services/automodel/src/nmp/automodel/entities/validators.py
+@@ -0,0 +1,48 @@
++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
++# SPDX-License-Identifier: Apache-2.0
++
++"""Shared validation logic for entity fields."""
++
++import re
++from typing import Optional
++
++from nmp.common.entities.constants import REGEX_WORD_CHARACTER_DOT_DASH
++from nmp.automodel.app.jobs.file_io.schemas import FILESET_PROTOCOL, FileSetRef
++
++_NAME_REGEX = re.compile(REGEX_WORD_CHARACTER_DOT_DASH)
++_UNSUPPORTED_PROTOCOLS = ("hf://", "ngc://", "s3://", "gs://")
++
++
++def _normalize_fileset_ref(uri: str) -> str:
++    """Parse and return canonical fileset reference (no ``fileset://`` prefix)."""
++    normalized = uri.strip()
++    for prefix in _UNSUPPORTED_PROTOCOLS:
++        if normalized.startswith(prefix):
++            raise ValueError(
++                f"Unsupported dataset URI protocol. Use 'workspace/name' or 'name' (resolved in the job workspace). Got: {uri}",
++            )
++    if normalized.startswith(FILESET_PROTOCOL):
++        normalized = normalized[len(FILESET_PROTOCOL) :]
++    ref = FileSetRef.model_validate(normalized)
++    if not _NAME_REGEX.fullmatch(ref.name):  # the whole name must match, not just a leading prefix
++        raise ValueError(
++            f"Invalid dataset name: '{ref.name}'. "
++            "Entity names must contain only word characters, dots, and hyphens.",
++        )
++    return str(ref)
++
++
++def validate_fileset_uri(uri: str) -> str:
++    """Validate a fileset reference as ``workspace/name`` or ``name``.
++
++    The job path ``workspace`` is used when the reference is a bare name.
++    A legacy ``fileset://`` prefix is accepted and stripped.
++    """
++    return _normalize_fileset_ref(uri)
++
++
++def validate_optional_fileset_uri(uri: Optional[str]) -> Optional[str]:
++    """Validate fileset reference, allowing None."""
++    if uri is None:
++        return None
++    return validate_fileset_uri(uri)
+diff --git a/services/automodel/src/nmp/automodel/entities/values.py b/services/automodel/src/nmp/automodel/entities/values.py
+new file mode 100644
+index 000000000..b236ac9b4
+--- /dev/null
++++ b/services/automodel/src/nmp/automodel/entities/values.py
+@@ -0,0 +1,96 @@
++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
++# SPDX-License-Identifier: Apache-2.0
++
++"""Value types for the Customizer service."""
++
++from enum import Enum, StrEnum
++
++
++class CheckpointFormat(str, Enum):
++    """Model checkpoint format (input or output)."""
++
++    HF = "hf"  # Standard HuggingFace format
++    HF_PEFT = "hf-peft"  # HuggingFace PEFT adapter (LoRA, etc.)
++    NEMO = "nemo"  # NeMo checkpoint format
++
++
++class Precision(str, Enum):
++    """Model precision for training."""
++
++    FP8 = "fp8"
++    BF16 = "bf16"
++    FP16 = "fp16"
++    FP32 = "fp32"
++
++    def to_torch_dtype(self) -> str:
++        """
++        Convert to a torch dtype string compatible with HuggingFace/Automodel.
++
++        Returns:
++            String like "bfloat16", "float16", "float32" that can be passed to
++            from_pretrained(torch_dtype=...) or Automodel's dtype_from_str().
++
++        Raises:
++            ValueError: If this precision cannot be represented as a torch dtype.
++                Of the members defined here that is FP8, which has no entry in the mapping.
++        """
++        mapping = {
++            Precision.BF16: "bfloat16",
++            Precision.FP16: "float16",
++            Precision.FP32: "float32",
++        }
++        if self not in mapping:
++            raise ValueError(
++                f"Precision '{self.value}' cannot be converted to a torch dtype. "
++                f"Supported: {[p.value for p in mapping.keys()]}. "
++                f"Note: FP8 requires separate quantization config."
++            )
++        return mapping[self]
++
++    @classmethod
++    def from_hf_dtype(cls, hf_dtype: str) -> "Precision":
++        """
++        Create Precision from a HuggingFace torch_dtype string.
++
++        Args:
++            hf_dtype: String like "bfloat16", "float16", "float32", "float".
++
++        Returns:
++            Corresponding Precision enum value.
++
++        Raises:
++            ValueError: If the dtype string is not recognized.
++        """
++        mapping = {
++            "bfloat16": cls.BF16,
++            "float16": cls.FP16,
++            "float32": cls.FP32,
++            "float": cls.FP32,
++        }
++        if hf_dtype not in mapping:
++            raise ValueError(f"Unknown HuggingFace dtype '{hf_dtype}'. Supported: {list(mapping.keys())}")
++        return mapping[hf_dtype]
++
++
++class TrainingType(str, Enum):
++    """Training algorithm type."""
++
++    SFT = "sft"
++    DISTILLATION = "distillation"
++    DPO = "dpo"
++    GRPO = "grpo"
++
++
++class FinetuningType(str, Enum):
++    """Finetuning strategy (full weights vs PEFT)."""
++
++    ALL_WEIGHTS = "all_weights"
++    LORA = "lora"
++    LORA_MERGED = "lora_merged"
++
++
++class OutputNameType(StrEnum):
++    """Output artifact type."""
++
++    ADAPTER = "adapter"
++    MODEL = "model"
+diff --git a/services/automodel/src/nmp/automodel/images.py b/services/automodel/src/nmp/automodel/images.py
+new file mode 100644
+index 000000000..b5406a5a3
+--- /dev/null
++++ b/services/automodel/src/nmp/automodel/images.py
+@@ -0,0 +1,44 @@
++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
++# SPDX-License-Identifier: Apache-2.0
++
++"""Docker image resolution for nmp-automodel job steps."""
++
++from __future__ import annotations
++
++from nmp.automodel.config import config
++from nmp.common.jobs.image import get_qualified_image
++
++# Default NGC dev registry for platform-built automodel images (path under nmp/).
++DEFAULT_AUTOMODEL_IMAGE_REGISTRY = "nvcr.io/0921617854601259/nemo-platform-dev"
++
++# NOTE(review): docker-bake.automodel.hcl documents the published repos as nmp-automodel-{base,tasks,training}
++# (a single path segment); confirm get_qualified_image maps these nested names onto the published ones.
++BASE_IMAGE_NAME = "nmp/automodel-base"
++TASKS_IMAGE_NAME = "nmp/automodel-tasks"
++TRAINING_IMAGE_NAME = "nmp/automodel-training"
++
++
++def get_automodel_qualified_image(name: str, override: str | None = None) -> str:
++    """Resolve a job step image reference.
++
++    Args:
++        name: Repository path under the registry (e.g. ``nmp/automodel-tasks``).
++        override: Full image ref from ``NMP_AUTOMODEL_TASKS_IMAGE`` / ``NMP_AUTOMODEL_TRAINING_IMAGE``.
++
++    Returns:
++        Fully qualified image (``{registry}/{name}:{tag}``) unless ``override`` is set.
++    """
++    if override:
++        return override
++    registry = config.image_registry or DEFAULT_AUTOMODEL_IMAGE_REGISTRY
++    return get_qualified_image(name, registry=registry)
++
++
++def get_tasks_image() -> str:
++    """CPU task steps (file_io, model_entity)."""
++    return get_automodel_qualified_image(TASKS_IMAGE_NAME, config.tasks_image)
++
++
++def get_training_image() -> str:
++    """GPU training step."""
++    return get_automodel_qualified_image(TRAINING_IMAGE_NAME, config.training_image)
+diff --git a/services/automodel/src/nmp/automodel/platform_client.py b/services/automodel/src/nmp/automodel/platform_client.py
+new file mode 100644
+index 000000000..d55672d1a
+--- /dev/null
++++ b/services/automodel/src/nmp/automodel/platform_client.py
+@@ -0,0 +1,39 @@
++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
++# SPDX-License-Identifier: Apache-2.0
++
++from nemo_platform import AsyncNeMoPlatform
++from nemo_platform._exceptions import NotFoundError, PermissionDeniedError
++from nemo_platform.types.models import ModelEntity
++from nmp.automodel.app.jobs.file_io.schemas import FileSetRef
++from nmp.common.entities.utils import parse_entity_ref
++
++
++async def check_dataset_access(sdk: AsyncNeMoPlatform, dataset_uri: str, default_workspace: str) -> None:
++    """Verify the caller can access the dataset fileset."""
++    ref = FileSetRef.model_validate(dataset_uri)
++    workspace = ref.workspace or default_workspace
++    try:
++        await sdk.files.filesets.retrieve(workspace=workspace, name=ref.name)
++    except PermissionDeniedError:
++        raise PermissionError(f"Access denied to dataset fileset '{workspace}/{ref.name}'") from None
++    except NotFoundError:
++        raise ValueError(
++            f"Dataset fileset '{ref.name}' not found in workspace '{workspace}'. Verify the dataset exists."
++        ) from None
++
++
++async def fetch_model_entity(
++    model_ref: str,
++    default_workspace: str,
++    sdk: AsyncNeMoPlatform,
++) -> ModelEntity:
++    """Retrieve a model entity by reference string."""
++    resolved_ref = parse_entity_ref(model_ref, default_workspace)
++    try:
++        return await sdk.models.retrieve(name=resolved_ref.name, workspace=resolved_ref.workspace, verbose=True)
++    except PermissionDeniedError:
++        raise PermissionError(f"Access denied to model '{resolved_ref.workspace}/{resolved_ref.name}'") from None
++    except NotFoundError:
++        raise ValueError(
++            f"Model entity not found: '{resolved_ref.workspace}/{resolved_ref.name}'. Verify the model entity exists."
++        ) from None
+diff --git a/services/automodel/src/nmp/automodel/tasks/__init__.py b/services/automodel/src/nmp/automodel/tasks/__init__.py
+new file mode 100644
+index 000000000..63c713713
+--- /dev/null
++++ b/services/automodel/src/nmp/automodel/tasks/__init__.py
+@@ -0,0 +1,4 @@
++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
++# SPDX-License-Identifier: Apache-2.0
++
++"""Automodel task entrypoints (see ``nmp.automodel.tasks.*`` subpackages)."""
+diff --git a/services/automodel/src/nmp/automodel/tasks/__main__.py b/services/automodel/src/nmp/automodel/tasks/__main__.py
+new file mode 100644
+index 000000000..e0e9a498f
+--- /dev/null
++++ b/services/automodel/src/nmp/automodel/tasks/__main__.py
+@@ -0,0 +1,11 @@
++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Entry point for running the file-io task as a module.""" ++ ++import sys ++ ++from nmp.automodel.tasks.file_io import run ++ ++if __name__ == "__main__": ++ sys.exit(run()) +diff --git a/services/automodel/src/nmp/automodel/tasks/docker/README.md b/services/automodel/src/nmp/automodel/tasks/docker/README.md +new file mode 100644 +index 000000000..b57974656 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/docker/README.md +@@ -0,0 +1,74 @@ ++# File I/O Task Docker Testing ++ ++Scripts for running the file_io task container locally. ++ ++## Prerequisites ++ ++1. **Build the Docker image** from the repository root: ++This will build the `my-registry/nmp-cpu-tasks:local` image that will be used for this task. ++ ++ ```bash ++ cd /path/to/nmp ++ make docker/nmp-cpu-tasks ++ ``` ++ ++2. **Have NeMo Platform running** (files service) at `http://localhost:8080` ++ ++## Quick Start ++ ++### Run with Docker Compose ++ ++```bash ++cd services/automodel/src/nmp/automodel/tasks/docker ++ ++# Run the task ++docker compose up ++ ++# Run with custom image ++FILE_IO_IMAGE=my-registry/nmp-cpu-tasks:dev docker compose up ++ ++# Run interactively ++docker compose run --rm file-io run task --task nmp.automodel.tasks.file_io ++``` ++ ++## Configuration ++ ++### Environment Variables ++ ++| Variable | Description | Default | ++|----------|-------------|---------| ++| `NMP_BASE_URL` | Base URL for NeMo Platform | `http://host.docker.internal:8000` | ++| `NMP_FILES_URL` | Files service URL | `http://host.docker.internal:8000` | ++| `NMP_JOBS_URL` | Jobs service URL (for progress) | `http://host.docker.internal:8000` | ++| `NEMO_JOB_ID` | Job identifier | `multi-file-job` | ++| `NEMO_JOB_STEP` | Step name | `FileIO` | ++| `NEMO_JOB_TASK` | Task identifier | `file-io-task` | ++| `NEMO_JOB_WORKSPACE` | Workspace name | `default` | ++| `LOG_LEVEL` | Logging level | `INFO` | ++| `FILE_IO_IMAGE` | Docker image to use | 
`nvcr.io/0921617854601259/nemo-platform-dev/nmp/automodel-tasks:local` | ++ ++### Config File Format ++ ++The `sample_config.json` defines what files to upload/download: ++ ++```json ++{ ++ "upload": [ ++ { ++ "src": "local_folder", ++ "dest": "workspace/fileset-name" ++ } ++ ], ++ "download": [ ++ { ++ "src": "workspace/fileset-name", ++ "dest": "local_folder" ++ } ++ ] ++} ++``` ++ ++- `upload[].src`: Path relative to job storage defined by NEMO_JOB_PERSISTENT_JOB_STORAGE_PATH (mounted at `/var/run/scratch`) ++- `upload[].dest`: Target FileSet in format `workspace/fileset-name` ++- `download[].src`: Source FileSet in format `workspace/fileset-name` ++- `download[].dest`: Path relative to job storage defined by NEMO_JOB_PERSISTENT_JOB_STORAGE_PATH +diff --git a/services/automodel/src/nmp/automodel/tasks/docker/docker-compose.yaml b/services/automodel/src/nmp/automodel/tasks/docker/docker-compose.yaml +new file mode 100644 +index 000000000..59feeede7 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/docker/docker-compose.yaml +@@ -0,0 +1,52 @@ ++# Docker Compose for file_io task local testing ++# ++# Usage: ++# # Start the task (runs once and exits) ++# docker compose up ++# ++# # Run with custom command ++# docker compose run --rm file-io run task --task nmp.automodel.tasks.file_io ++# ++# Prerequisites: ++# - Build the image first (from Platform repo root): ++# docker buildx bake -f docker-bake.automodel.hcl nmp-automodel-tasks-docker ++# - Have NeMo Platform running at http://localhost:8080 ++# - Create sample_config.json (or use the one provided) ++ ++services: ++ file-io: ++ image: ${FILE_IO_IMAGE:-nvcr.io/0921617854601259/nemo-platform-dev/nmp/automodel-tasks:local} ++ container_name: file-io-task ++ ++ # Mount config file and storage directory ++ # Uses the file_io test data directory; downloaded files land in its temp/ subdirectory, which is in .gitignore ++ # NOTE(review): from this directory the mount below resolves to services/tests/tasks/file_io/data, not services/customizer/tests/tasks/file_io/data - confirm the intended path ++ volumes: ++ - 
../../../../../../tests/tasks/file_io/data:/var/run/scratch ++ ++ environment: ++ # NeMo Platform URLs - use host.docker.internal to reach host services ++ NMP_BASE_URL: ${NMP_BASE_URL:-http://host.docker.internal:8000} ++ NMP_FILES_URL: ${NMP_FILES_URL:-http://host.docker.internal:8000} ++ NMP_JOBS_URL: ${NMP_JOBS_URL:-http://host.docker.internal:8000} ++ ++ # Job configuration paths (container paths) ++ NEMO_JOB_STEP_CONFIG_FILE_PATH: /var/run/scratch/sample_config.json ++ NEMO_JOB_PERSISTENT_JOB_STORAGE_PATH: /var/run/scratch ++ ++ # Job metadata ++ NEMO_JOB_ID: ${NEMO_JOB_ID:-multi-file-job} ++ NEMO_JOB_STEP: ${NEMO_JOB_STEP:-FileIO} ++ NEMO_JOB_TASK: ${NEMO_JOB_TASK:-file-io-task} ++ NEMO_JOB_WORKSPACE: ${NEMO_JOB_WORKSPACE:-default} ++ ++ # Logging ++ LOG_LEVEL: ${LOG_LEVEL:-INFO} ++ ++ # Default command - run the file_io task ++ # Note: ENTRYPOINT is already "nemo-platform", so command should NOT include it ++ command: ["run", "task", "--task", "nmp.automodel.tasks.file_io"] ++ ++ # For macOS/Windows Docker Desktop - allows reaching host services ++ extra_hosts: ++ - "host.docker.internal:host-gateway" +diff --git a/services/automodel/src/nmp/automodel/tasks/file_io/__init__.py b/services/automodel/src/nmp/automodel/tasks/file_io/__init__.py +new file mode 100644 +index 000000000..8df0acb9e +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/file_io/__init__.py +@@ -0,0 +1,8 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
++# SPDX-License-Identifier: Apache-2.0
++
++"""File I/O task for Automodel customization jobs."""
++
++from nmp.automodel.tasks.file_io.run import run
++
++__all__ = ["run"]
+diff --git a/services/automodel/src/nmp/automodel/tasks/file_io/__main__.py b/services/automodel/src/nmp/automodel/tasks/file_io/__main__.py
+new file mode 100644
+index 000000000..68981865e
+--- /dev/null
++++ b/services/automodel/src/nmp/automodel/tasks/file_io/__main__.py
+@@ -0,0 +1,9 @@
++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
++# SPDX-License-Identifier: Apache-2.0
++
++import sys
++
++from nmp.automodel.tasks.file_io.run import run
++
++if __name__ == "__main__":
++    sys.exit(run())
+diff --git a/services/automodel/src/nmp/automodel/tasks/file_io/callbacks.py b/services/automodel/src/nmp/automodel/tasks/file_io/callbacks.py
+new file mode 100644
+index 000000000..df7ef199e
+--- /dev/null
++++ b/services/automodel/src/nmp/automodel/tasks/file_io/callbacks.py
+@@ -0,0 +1,793 @@
++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
++# SPDX-License-Identifier: Apache-2.0
++
++"""Custom fsspec callbacks for progress reporting during file I/O operations."""
++
++import logging
++import os
++import threading
++from abc import abstractmethod
++from dataclasses import dataclass
++from pathlib import Path
++from typing import Any
++
++from fsspec.callbacks import Callback, TqdmCallback
++from nmp.common.jobs.schemas import PlatformJobStatus
++from nmp.automodel.app.jobs.file_io.schemas import DownloadStats, TaskPhase, UploadStats
++from nmp.automodel.tasks.file_io.progress_reporter import ProgressReporter
++
++logger = logging.getLogger(__name__)
++
++
++def get_percentage(current: int, total: int) -> int:
++    """Get percentage of current / total.
++
++    Args:
++        current: The current value (numerator).
++        total: The total value (denominator).
++
++    Returns:
++        Integer percentage from 0-100, rounded down (0 when total is 0).
++
++    Raises:
++        ValueError: If current > total, or if either value is negative.
++
++    """
++    # Sign checks come first so a negative total is reported as such, not as "current > total".
++    if total < 0:
++        raise ValueError(f"Unexpected negative value of the total value: total={total}, current={current}")
++    if current < 0:
++        raise ValueError(f"Unexpected negative value of the current value: current={current}, total={total}")
++    if current > total:
++        raise ValueError(
++            f"Unexpected value of the current and total values: current={current} cannot be greater than total={total}",
++        )
++
++    if total == 0:
++        return 0
++    # Integer arithmetic: int((29 / 100) * 100) truncates to 28 because of float rounding.
++    return int((current * 100) // total)
++
++
++@dataclass
++class FileInfo:
++    """A dataclass for file information."""
++
++    path: str
++    size: int
++
++
++class TqdmPerFileUploadCallback(Callback):
++    """A callback that creates a separate tqdm progress bar for each file.
++
++    Unlike TqdmCallback which shows overall progress, this callback creates a new
++    tqdm progress bar when branched() is called for each file. Each file's progress
++    bar shows byte-level progress for that individual file.
++
++    Usage:
++        callback = TqdmPerFileUploadCallback(src_path=src)
++        filesystem_sdk.put(src, dest, recursive=True, callback=callback)
++        # Creates a separate progress bar for each file being uploaded
++    """
++
++    def __init__(self, src_path: Path, **kwargs: Any):
++        """Initialize the per-file tqdm callback.
++
++        Args:
++            src_path: File or directory being uploaded; used to label each progress bar.
++            **kwargs: Additional arguments passed to the base Callback.
++
++        """
++        self.src_path = src_path
++        super().__init__(**kwargs)
++
++    def branched(self, full_src_path: str, full_dest_path: str, **kwargs: Any) -> TqdmCallback:
++        """Create a TqdmCallback for this specific file transfer.
++
++        Args:
++            full_src_path: Source file path.
++            full_dest_path: Destination file path.
++            **kwargs: Additional keyword arguments.
++
++        Returns:
++            A TqdmCallback configured for byte-level progress of this file.
++
++        """
++        # Extract just the filename for the progress bar description
++        if self.src_path.is_file():
++            relative_path_upload_dir = self.src_path.name
++        else:
++            try:
++                relative_path_upload_dir = Path(full_src_path).relative_to(self.src_path)
++            except ValueError:
++                # The label is cosmetic: fall back to the file name instead of failing the upload.
++                relative_path_upload_dir = Path(full_src_path).name
++        return TqdmCallback(
++            # https://tqdm.github.io/docs/tqdm
++            tqdm_kwargs={
++                "desc": f"Uploading {relative_path_upload_dir!s}",
++                # use bytes as the unit
++                "unit": "B",
++                # scale the unit to be more readable (e.g. 1024 bytes = 1 KB)
++                "unit_scale": True,
++                # divide the unit by 1024 to get the next unit
++                "unit_divisor": 1024,
++                # The minimum number of iterations (bytes processed) that must occur before the progress bar refreshes
++                "miniters": 1,
++            },
++        )
++
++
++class TqdmPerFileDownloadCallback(Callback):
++    """A callback that creates a separate tqdm progress bar for each file download.
++
++    Similar to TqdmPerFileUploadCallback but for download operations. Creates a new
++    tqdm progress bar when branched() is called for each file being downloaded.
++
++    The callback accepts a file_sizes dict to set the total size for each file's
++    progress bar. This is necessary because the SDK may not receive Content-Length
++    headers for streaming downloads (e.g., when chunked transfer encoding is used).
++
++    Usage:
++        # Build file_sizes from listing
++        files = list_fileset_files(fileset)
++        file_sizes = {f.path.lstrip("/"): f.size for f in files}
++
++        callback = TqdmPerFileDownloadCallback(
++            dest_path=dest_dir,
++            fileset_path="workspace/fileset",
++            file_sizes=file_sizes,
++        )
++        filesystem_sdk.get(src, dest, recursive=True, callback=callback)
++        # Creates a separate progress bar for each file being downloaded
++    """
++
++    def __init__(self, dest_path: Path, fileset_path: str, file_sizes: dict[str, int] | None = None, **kwargs: Any):
++        """Initialize the per-file tqdm download callback.
++
++        Args:
++            dest_path: The local destination directory path.
++            fileset_path: The fileset path (e.g., "workspace/fileset") used to extract
++                relative file paths from full source paths.
++            file_sizes: Optional dict mapping relative file paths to their sizes in bytes.
++                Used to set the progress bar's total for percentage display.
++            **kwargs: Additional arguments passed to the base Callback.
++
++        """
++        self.dest_path = dest_path
++        self.fileset_path = fileset_path.rstrip("/")
++        self.file_sizes = file_sizes or {}
++        super().__init__(**kwargs)
++
++    def branched(self, full_src_path: str, full_dest_path: str, **kwargs: Any) -> TqdmCallback:
++        """Create a TqdmCallback for this specific file download.
++
++        Args:
++            full_src_path: Source file path in the fileset (e.g., "workspace/fileset/dir/file.txt").
++            full_dest_path: Destination local file path.
++            **kwargs: Additional keyword arguments.
++
++        Returns:
++            A TqdmCallback configured for byte-level progress of this file.
++
++        """
++        # Extract relative path for the progress bar description
++        # full_dest_path is the full local path, we want to show just the filename or relative path
++        dest_full_path = Path(full_dest_path)
++        if self.dest_path.is_file():
++            relative_path = dest_full_path.name
++        else:
++            try:
++                relative_path = dest_full_path.relative_to(self.dest_path)
++            except ValueError:
++                # If can't compute relative path, use filename
++                relative_path = dest_full_path.name
++
++        # Extract relative file path from full source path to look up size
++        # full_src_path format: "workspace/fileset/relative/path/to/file.txt"
++        # We need to extract "relative/path/to/file.txt"
++        relative_file_path = full_src_path
++        if full_src_path.startswith(self.fileset_path):
++            relative_file_path = full_src_path[len(self.fileset_path) :].lstrip("/")
++
++        # Look up file size from pre-computed mapping
++        file_size = self.file_sizes.get(relative_file_path)
++
++        callback = TqdmCallback(
++            tqdm_kwargs={
++                "desc": f"Downloading {relative_path!s}",
++                "unit": "B",
++                "unit_scale": True,
++                "unit_divisor": 1024,
++                "miniters": 1,
++            },
++        )
++
++        # Set size if we know it - this enables percentage display in tqdm
++        # Must be called via set_size() rather than tqdm_kwargs["total"] because
++        # the SDK may also call set_size() from Content-Length header
++        if file_size is not None:
++            callback.set_size(file_size)
++
++        return callback
++
++
++class BaseProgressCallback(Callback):
++    """Base class for file upload/download progress callbacks.
++
++    This abstract base class provides common functionality for tracking file transfer
++    progress and reporting to the Jobs service. Subclasses implement operation-specific
++    behavior (upload vs download).
++
++    Thread Safety:
++        This callback uses threading.Lock for synchronization. FilesetFileSystem is
++        async-first and transfers files concurrently. The lock protects against
++        concurrent access when multiple files complete simultaneously.
++
++    Attributes:
++        progress_reporter: The progress reporter for sending updates to Jobs service.
++        fileset_name: The name of the fileset (workspace/name format).
++        total_files: Total number of files to transfer.
++        total_size: Total size of all files in bytes.
++        stats: Mutable stats object to track progress (UploadStats or DownloadStats).
++        _lock: Threading lock for thread-safe stats updates.
++
++    """
++
++    progress_reporter: ProgressReporter
++    fileset_name: str
++    total_files: int
++    total_size: int
++    stats: UploadStats | DownloadStats
++    _lock: threading.Lock
++
++    def __init__(
++        self,
++        progress_reporter: ProgressReporter,
++        fileset_name: str,
++        total_files: int,
++        total_size: int,
++        stats: UploadStats | DownloadStats,
++        **kwargs: Any,
++    ):
++        """Initialize the progress callback.
++
++        Args:
++            progress_reporter: The progress reporter for sending updates to Jobs service.
++            fileset_name: The name of the fileset (workspace/name format).
++            total_files: Total number of files to transfer.
++            total_size: Total size of all files in bytes.
++            stats: Mutable stats object to track progress.
++            **kwargs: Additional arguments passed to the base Callback.
++
++        """
++        super().__init__(**kwargs)
++        self.progress_reporter = progress_reporter
++        self.fileset_name = str(fileset_name)
++        self.total_files = total_files
++        self.total_size = total_size
++        self.stats = stats
++        self._lock = threading.Lock()
++
++    @staticmethod
++    def list_local_files(src_path: Path) -> list[FileInfo]:
++        """List all files from a local path (file or directory).
++
++        If src_path is a file, returns a single FileInfo with the filename.
++        If src_path is a directory, recursively lists all files.
++
++        Returns list of FileInfo objects with 'path' (relative path) and 'size' attributes.
++        An empty list is returned when the path does not exist or listing fails.
++        """
++        if not src_path.exists():
++            logger.warning(f"Failed to list local files. Source path does not exist: {src_path}")
++            return []
++
++        try:
++            # Handle single file
++            if src_path.is_file():
++                logger.info(f"Found 1 file: {src_path.name}")
++                return [
++                    FileInfo(
++                        path=src_path.name,
++                        size=src_path.stat().st_size,
++                    ),
++                ]
++
++            # Handle directory
++            files = []
++            for root, _, filenames in os.walk(src_path):
++                for filename in filenames:
++                    full_path = Path(root) / filename
++                    relative_path = full_path.relative_to(src_path)
++                    files.append(
++                        FileInfo(
++                            path=str(relative_path),
++                            size=full_path.stat().st_size,
++                        ),
++                    )
++            logger.info(f"Found {len(files)} files in {src_path}")
++            return files
++        except Exception as e:
++            logger.warning(f"Failed to list local files. Source path: {src_path}. Error: {e}")
++            return []
++
++    # NOTE(review): @abstractmethod is only enforced on classes whose metaclass is ABCMeta; confirm that
++    # fsspec's Callback provides one, otherwise this marker is documentation only.
++    @abstractmethod
++    def branched(self, source_path: str, dest_path: str, **kwargs: Any) -> "BaseSingleFileCallback":
++        """Create a child callback for a single file transfer.
++
++        Args:
++            source_path: Source file path.
++            dest_path: Destination file path.
++            **kwargs: Additional keyword arguments.
++
++        Returns:
++            A BaseSingleFileCallback subclass for tracking this file's transfer.
++
++        """
++        ...
++
++
++class BaseSingleFileCallback(Callback):
++    """Base class for single file upload/download callbacks.
++
++    This abstract base class provides common functionality for tracking individual
++    file transfers within a batch operation. Subclasses implement operation-specific
++    behavior via the template method pattern.
++
++    The close() method uses the template method pattern, calling abstract methods
++    that subclasses override to provide operation-specific behavior:
++    - _get_phase(): Returns the TaskPhase for this operation
++    - _get_file_display_path(): Returns the path to display for logging
++    - _update_stats(): Updates the parent's stats for this operation
++    - _get_files_count(): Returns the current files count from stats
++    - _build_status_details(): Builds the status_details dict for progress reporting
++    """
++
++    parent: BaseProgressCallback
++    source_path: str
++    dest_path: str
++    _completed: bool
++
++    def __init__(
++        self,
++        parent: BaseProgressCallback,
++        source_path: str,
++        dest_path: str,
++        **kwargs: Any,
++    ):
++        """Initialize the single file callback.
++
++        Args:
++            parent: The parent progress callback.
++            source_path: Path to the source file.
++            dest_path: Destination path for the file.
++            **kwargs: Additional arguments passed to the base Callback.
++
++        """
++        super().__init__(**kwargs)
++        self.parent = parent
++        self.source_path = source_path
++        self.dest_path = dest_path
++        self._completed = False
++
++    @abstractmethod
++    def _get_phase(self) -> str:
++        """Return the TaskPhase for this operation."""
++        ...
++
++    @abstractmethod
++    def _get_file_display_path(self) -> str:
++        """Return the path to use for display/logging."""
++        ...
++
++    @abstractmethod
++    def _update_stats(self) -> None:
++        """Update the parent's stats for this operation (called within lock)."""
++        ...
++ ++ @abstractmethod ++ def _get_files_count(self) -> int: ++ """Return the current files count from stats (called within lock).""" ++ ... ++ ++ @abstractmethod ++ def _build_status_details(self, files_count: int, total_bytes: int, current_file: str) -> dict[str, Any]: ++ """Build the status_details dict for progress reporting. ++ ++ Args: ++ files_count: Number of files transferred so far. ++ total_bytes: Total bytes transferred so far. ++ current_file: Name of the current file for display. ++ ++ Returns: ++ Dictionary with status details for the progress report. ++ ++ """ ++ ... ++ ++ def close(self) -> None: ++ """Called when the file transfer completes. ++ ++ Updates the parent's statistics and reports progress to the Jobs service. ++ Thread-safe: uses parent's lock to protect stats updates. ++ """ ++ if self._completed: ++ return ++ ++ self._completed = True ++ parent = self.parent ++ ++ # Extract the filename for logging/display ++ current_file = self._get_file_display_path() ++ ++ # Thread-safe stats update ++ with parent._lock: ++ # Update stats (operation-specific) ++ self._update_stats() ++ ++ # Capture current values while holding the lock ++ files_count = self._get_files_count() ++ total_bytes = parent.stats.total_bytes ++ ++ logger.debug(f"File transferred: {current_file} ({files_count}/{parent.total_files})") ++ ++ # Report progress to Jobs service (outside lock to avoid holding it during I/O) ++ parent.progress_reporter.update_progress( ++ status=PlatformJobStatus.ACTIVE, ++ status_details=self._build_status_details(files_count, total_bytes, current_file), ++ ) ++ ++ def __enter__(self) -> "BaseSingleFileCallback": ++ return self ++ ++ def __exit__(self, *exc_args: object) -> None: ++ self.close() ++ ++ ++class FileUploadProgressCallback(BaseProgressCallback): ++ """Callback for tracking file upload progress and reporting to the Jobs service. 
++ ++ This callback integrates with fsspec's callback mechanism to report progress ++ after each file is uploaded. It uses the branched callback pattern where: ++ - The parent callback tracks overall upload statistics ++ - Child callbacks are created for each file via `branched()` ++ - When a child callback closes, it signals file completion to the parent ++ ++ Usage: ++ callback = FileUploadProgressCallback( ++ progress_reporter=reporter, ++ src_path=src_path, ++ fileset_name="workspace/fileset", ++ stats=upload_stats, ++ ) ++ filesystem_sdk.put(src, dest, recursive=True, callback=callback) ++ """ ++ ++ stats: UploadStats ++ ++ def __init__( ++ self, ++ progress_reporter: ProgressReporter, ++ src_path: Path, ++ fileset_name: str, ++ stats: UploadStats, ++ **kwargs: Any, ++ ): ++ """Initialize the upload progress callback. ++ ++ Args: ++ progress_reporter: The progress reporter for sending updates to Jobs service. ++ src_path: The source path (file or directory) to upload. ++ fileset_name: The name of the target fileset (workspace/name format). ++ stats: Mutable UploadStats object to track progress. ++ **kwargs: Additional arguments passed to the base Callback. 
++ ++ """ ++ # List files to get stats before upload ++ files = self.list_local_files(src_path) ++ ++ if not files: ++ logger.warning(f"Source path {src_path} contains no files") ++ ++ total_files = len(files) ++ total_size = sum(f.size for f in files) ++ ++ # Initialize base class with computed values ++ super().__init__( ++ progress_reporter=progress_reporter, ++ fileset_name=fileset_name, ++ total_files=total_files, ++ total_size=total_size, ++ stats=stats, ++ **kwargs, ++ ) ++ ++ logger.info(f"Uploading {total_files} files ({total_size} bytes) to {self.fileset_name}") ++ ++ # Report initial progress ++ progress_reporter.update_progress( ++ status=PlatformJobStatus.ACTIVE, ++ status_details={ ++ "phase": TaskPhase.UPLOADING, ++ "fileset": self.fileset_name, ++ "total_files": total_files, ++ "total_size": total_size, ++ "uploaded_files": 0, ++ "uploaded_bytes": 0, ++ }, ++ ) ++ ++ def branched(self, source_path: str, dest_path: str, **kwargs: Any) -> "SingleFileUploadCallback": ++ """Create a child callback for a single file upload. ++ ++ This method is called by fsspec when starting a file transfer within ++ a recursive put operation. It returns a child callback that tracks ++ the individual file's progress and reports completion to the parent. ++ ++ Args: ++ source_path: Source file path. ++ dest_path: Destination file path. ++ **kwargs: Additional keyword arguments. ++ ++ Returns: ++ A SingleFileUploadCallback for tracking this file's upload. ++ ++ """ ++ return SingleFileUploadCallback( ++ parent=self, ++ source_path=source_path, ++ dest_path=dest_path, ++ **kwargs, ++ ) ++ ++ ++class SingleFileUploadCallback(BaseSingleFileCallback): ++ """Callback for tracking a single file upload within a batch operation. ++ ++ This child callback is created by FileUploadProgressCallback.branched() ++ for each file being uploaded. When the upload completes and this callback ++ is closed, it notifies the parent to update overall progress. 
++ """ ++ ++ parent: FileUploadProgressCallback ++ ++ def _get_phase(self) -> str: ++ """Return the TaskPhase for upload operations.""" ++ return TaskPhase.UPLOADING ++ ++ def _get_file_display_path(self) -> str: ++ """Return the destination filename for display.""" ++ return self.dest_path.split("/")[-1] if "/" in self.dest_path else self.dest_path ++ ++ def _update_stats(self) -> None: ++ """Update the parent's upload stats.""" ++ self.parent.stats.files_uploaded += 1 ++ if self.size is not None: ++ self.parent.stats.total_bytes += self.size ++ ++ def _get_files_count(self) -> int: ++ """Return the current uploaded files count.""" ++ return self.parent.stats.files_uploaded ++ ++ def _build_status_details(self, files_count: int, total_bytes: int, current_file: str) -> dict[str, Any]: ++ """Build the status_details dict for upload progress reporting.""" ++ return { ++ "phase": TaskPhase.UPLOADING, ++ "fileset": self.parent.fileset_name, ++ "total_files": self.parent.total_files, ++ "total_size": self.parent.total_size, ++ "uploaded_files": files_count, ++ "uploaded_bytes": total_bytes, ++ "current_file": current_file, ++ "progress_pct": get_percentage(files_count, self.parent.total_files), ++ } ++ ++ ++class FileDownloadProgressCallback(BaseProgressCallback): ++ """Callback for tracking file download progress and reporting to the Jobs service. ++ ++ Similar to FileUploadProgressCallback but for download operations. ++ ++ Usage: ++ callback = FileDownloadProgressCallback( ++ progress_reporter=reporter, ++ fileset_name="workspace/fileset", ++ total_files=10, ++ total_size=1024000, ++ stats=download_stats, ++ ) ++ filesystem_sdk.get(src, dest, recursive=True, callback=callback) ++ """ ++ ++ stats: DownloadStats ++ ++ def __init__( ++ self, ++ progress_reporter: ProgressReporter, ++ fileset_name: str, ++ total_files: int, ++ total_size: int, ++ stats: DownloadStats, ++ **kwargs: Any, ++ ): ++ """Initialize the download progress callback. 
++ ++ Args: ++ progress_reporter: The progress reporter for sending updates to Jobs service. ++ fileset_name: The name of the source fileset (workspace/name format). ++ total_files: Total number of files to download. ++ total_size: Total size of all files in bytes. ++ stats: Mutable DownloadStats object to track progress. ++ **kwargs: Additional arguments passed to the base Callback. ++ ++ """ ++ super().__init__( ++ progress_reporter=progress_reporter, ++ fileset_name=fileset_name, ++ total_files=total_files, ++ total_size=total_size, ++ stats=stats, ++ **kwargs, ++ ) ++ ++ logger.info(f"Downloading {total_files} files ({total_size} bytes) from {self.fileset_name}") ++ ++ # Report initial progress ++ progress_reporter.update_progress( ++ status=PlatformJobStatus.ACTIVE, ++ status_details={ ++ "phase": TaskPhase.DOWNLOADING, ++ "fileset": self.fileset_name, ++ "total_files": total_files, ++ "total_size": total_size, ++ "downloaded_files": 0, ++ "downloaded_bytes": 0, ++ }, ++ ) ++ ++ def branched(self, source_path: str, dest_path: str, **kwargs: Any) -> "SingleFileDownloadCallback": ++ """Create a child callback for a single file download. ++ ++ Args: ++ source_path: Source file path in the fileset. ++ dest_path: Destination local file path. ++ **kwargs: Additional keyword arguments. ++ ++ Returns: ++ A SingleFileDownloadCallback for tracking this file's download. ++ ++ """ ++ return SingleFileDownloadCallback( ++ parent=self, ++ source_path=source_path, ++ dest_path=dest_path, ++ **kwargs, ++ ) ++ ++ ++class SingleFileDownloadCallback(BaseSingleFileCallback): ++ """Callback for tracking a single file download within a batch operation. ++ ++ This child callback is created by FileDownloadProgressCallback.branched() ++ for each file being downloaded. When the download completes and this callback ++ is closed, it notifies the parent to update overall progress. 
++ """ ++ ++ parent: FileDownloadProgressCallback ++ ++ def _get_phase(self) -> str: ++ """Return the TaskPhase for download operations.""" ++ return TaskPhase.DOWNLOADING ++ ++ def _get_file_display_path(self) -> str: ++ """Return the source filename for display.""" ++ return self.source_path.split("/")[-1] if "/" in self.source_path else self.source_path ++ ++ def _update_stats(self) -> None: ++ """Update the parent's download stats.""" ++ self.parent.stats.files_downloaded += 1 ++ if self.size is not None: ++ self.parent.stats.total_bytes += self.size ++ ++ def _get_files_count(self) -> int: ++ """Return the current downloaded files count.""" ++ return self.parent.stats.files_downloaded ++ ++ def _build_status_details(self, files_count: int, total_bytes: int, current_file: str) -> dict[str, Any]: ++ """Build the status_details dict for download progress reporting.""" ++ return { ++ "phase": TaskPhase.DOWNLOADING, ++ "fileset": self.parent.fileset_name, ++ "total_files": self.parent.total_files, ++ "total_size": self.parent.total_size, ++ "downloaded_files": files_count, ++ "downloaded_bytes": total_bytes, ++ "current_file": current_file, ++ "progress_pct": get_percentage(files_count, self.parent.total_files), ++ } ++ ++ ++class CompositeCallback(Callback): ++ """A callback that delegates to multiple child callbacks. ++ ++ This allows combining multiple callbacks (e.g., TqdmCallback for console progress ++ and FileUploadProgressCallback for Jobs service reporting) into a single callback ++ that can be passed to fsspec operations. ++ ++ All callback methods are forwarded to each child callback in order. ++ ++ Usage: ++ tqdm_cb = TqdmCallback(tqdm_kwargs={"desc": "Uploading"}) ++ progress_cb = FileUploadProgressCallback(...) ++ composite = CompositeCallback(tqdm_cb, progress_cb) ++ filesystem_sdk.put(src, dest, recursive=True, callback=composite) ++ """ ++ ++ def __init__(self, *callbacks: Callback, **kwargs: Any): ++ """Initialize with multiple callbacks. 
++ ++ Args: ++ *callbacks: Variable number of Callback instances to delegate to. ++ **kwargs: Additional arguments passed to the base Callback. ++ ++ """ ++ super().__init__(**kwargs) ++ self.callbacks = list(callbacks) ++ ++ def set_size(self, size: int) -> None: ++ """Set size on all child callbacks.""" ++ self.size = size ++ for cb in self.callbacks: ++ cb.set_size(size) ++ ++ def absolute_update(self, value: int) -> None: ++ """Update absolute value on all child callbacks.""" ++ self.value = value ++ for cb in self.callbacks: ++ cb.absolute_update(value) ++ ++ def relative_update(self, inc: int = 1) -> None: ++ """Update relative value on all child callbacks.""" ++ self.value += inc ++ for cb in self.callbacks: ++ cb.relative_update(inc) ++ ++ def branched(self, source_path: str, dest_path: str, **kwargs: Any) -> "CompositeCallback": ++ """Create a composite child callback from all child callbacks' branched results. ++ ++ Each child callback's branched() method is called, and the results are ++ wrapped in a new CompositeCallback. ++ ++ Args: ++ source_path: Source path. ++ dest_path: Destination path. ++ **kwargs: Additional keyword arguments. ++ ++ Returns: ++ A new CompositeCallback wrapping all child callbacks' branched results. 
++ ++ """ ++ child_callbacks = [cb.branched(source_path, dest_path, **kwargs) for cb in self.callbacks] ++ return CompositeCallback(*child_callbacks) ++ ++ def call(self, hook_name: str | None = None, **kwargs: Any) -> None: ++ """Call hooks on all child callbacks.""" ++ for cb in self.callbacks: ++ cb.call(hook_name, **kwargs) ++ ++ def close(self) -> None: ++ """Close all child callbacks.""" ++ for cb in self.callbacks: ++ cb.close() ++ ++ def __enter__(self) -> "CompositeCallback": ++ for cb in self.callbacks: ++ cb.__enter__() ++ return self ++ ++ def __exit__(self, *exc_args: object) -> None: ++ for cb in self.callbacks: ++ cb.__exit__(*exc_args) +diff --git a/services/automodel/src/nmp/automodel/tasks/file_io/progress_reporter.py b/services/automodel/src/nmp/automodel/tasks/file_io/progress_reporter.py +new file mode 100644 +index 000000000..00fa66011 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/file_io/progress_reporter.py +@@ -0,0 +1,93 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++import logging ++from typing import Any, Protocol ++ ++from nemo_platform import NeMoPlatform, omit ++from nemo_platform._exceptions import APIError ++from nmp.automodel.app.jobs.context import NMPJobContext ++from nmp.automodel.app.jobs.file_io.schemas import ProgressReportError ++from nmp.automodel.tasks.file_io.utils import sdk_error_handler ++from nmp.common.jobs.schemas import PlatformJobStatus ++ ++logger = logging.getLogger(__name__) ++ ++ ++class ProgressReporter(Protocol): ++ """Interface for reporting task progress.""" ++ ++ def update_progress( ++ self, ++ status: PlatformJobStatus, ++ status_details: dict[str, Any] | None = None, ++ error_details: dict[str, Any] | None = None, ++ error_stack: str | None = None, ++ ) -> None: ++ """Update task progress.""" ++ ... ++ ++ ++class NoOpProgressReporter: ++ """Progress reporter that does nothing. 
Used when Jobs service is not configured.""" ++ ++ def update_progress( ++ self, ++ status: PlatformJobStatus, ++ status_details: dict[str, Any] | None = None, ++ error_details: dict[str, Any] | None = None, ++ error_stack: str | None = None, ++ ) -> None: ++ """No-op: silently ignore progress updates.""" ++ ++ ++class JobsServiceProgressReporter: ++ """Reports progress to the Jobs service via SDK.""" ++ ++ def __init__(self, sdk: NeMoPlatform, workspace: str, job_id: str, step_name: str, task_id: str): ++ self.sdk = sdk ++ self.workspace = workspace ++ self.job_id = job_id ++ self.step_name = step_name ++ self.task_id = task_id ++ ++ def update_progress( ++ self, ++ status: PlatformJobStatus, ++ status_details: dict[str, object] | None = None, ++ error_details: dict[str, object] | None = None, ++ error_stack: str | None = None, ++ ) -> None: ++ """Update task progress via SDK.""" ++ try: ++ with sdk_error_handler( ++ ProgressReportError, ++ f"update progress for task: {self.task_id}, job: {self.job_id}, step: {self.step_name}", ++ passthrough=(APIError,), ++ ): ++ self.sdk.jobs.tasks.create_or_update( ++ self.task_id, ++ workspace=self.workspace, ++ job=self.job_id, ++ step=self.step_name, ++ status=status.value, ++ status_details=status_details if status_details else omit, ++ error_details=error_details if error_details else omit, ++ error_stack=error_stack if error_stack else omit, ++ ) ++ logger.debug(f"Progress updated: {status} - {status_details}") ++ except Exception as e: ++ logger.warning( ++ f"Failed to report progress for task {self.task_id}, job {self.job_id}, step {self.step_name}: {e}", ++ ) ++ ++ @staticmethod ++ def create_progress_reporter(sdk: NeMoPlatform, job_ctx: NMPJobContext) -> ProgressReporter: ++ """Create JobsServiceProgressReporter when jobs_url is set, else NoOpProgressReporter.""" ++ if job_ctx.jobs_url: ++ logger.info(f"Progress reporting enabled: {job_ctx.jobs_url}") ++ return JobsServiceProgressReporter( ++ sdk, job_ctx.workspace, 
job_ctx.job_id, job_ctx.step, job_ctx.normalized_task ++ ) ++ logger.info("Progress reporting disabled: jobs_url not configured") ++ return NoOpProgressReporter() +diff --git a/services/automodel/src/nmp/automodel/tasks/file_io/run.py b/services/automodel/src/nmp/automodel/tasks/file_io/run.py +new file mode 100644 +index 000000000..f14acd7b2 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/file_io/run.py +@@ -0,0 +1,560 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++"""File I/O task entry point. ++ ++Handles file operations between NeMo Platform Files Service and the job's shared PVC. ++ ++The task reads configuration and performs: ++- Downloads: If config.download is non-empty, download files from FileSets to local paths ++- Uploads: If config.upload is non-empty, upload files from local paths to FileSets ++ ++Usage: ++ export NEMO_JOB_STEP_CONFIG_FILE_PATH= ++ python -m nmp.automodel.tasks.file_io ++""" ++ ++import logging ++from pathlib import Path ++ ++import httpx ++ ++# https://docs.nvidia.com/nemo/microservices/latest/pysdk/index.html#handling-errors ++from nemo_platform import ( ++ APIConnectionError, ++ APITimeoutError, ++ ConflictError, ++ InternalServerError, ++ NeMoPlatform, ++ NotFoundError, ++) ++from nemo_platform.types.files.fileset_file import FilesetFile ++from nmp.automodel.app.constants import SERVICE_NAME ++from nmp.automodel.app.jobs.context import NMPJobContext ++from nmp.automodel.app.jobs.file_io.schemas import ( ++ DownloadItem, ++ DownloadStats, ++ FileDownloadError, ++ FileSetRef, ++ FileUploadError, ++ PathTraversalError, ++ TaskPhase, ++ UploadItem, ++ UploadStats, ++) ++from nmp.automodel.tasks.file_io.callbacks import ( ++ CompositeCallback, ++ FileDownloadProgressCallback, ++ FileUploadProgressCallback, ++ TqdmPerFileDownloadCallback, ++ TqdmPerFileUploadCallback, ++) ++from 
nmp.automodel.tasks.file_io.progress_reporter import JobsServiceProgressReporter, ProgressReporter ++from nmp.automodel.tasks.file_io.utils import ( ++ filesystem_sdk_error_handler, ++ get_config, ++ sdk_error_handler, ++ validate_safe_path, ++ validate_storage_path, ++) ++from nmp.common.jobs.schemas import PlatformJobStatus ++from nmp.common.sdk_factory import get_task_sdk ++from tenacity import before_sleep_log, retry, retry_if_exception_type, stop_after_attempt, wait_exponential ++ ++logger = logging.getLogger(__name__) ++ ++# Timeout configurations for SDK operations (httpx.Timeout for API calls) ++CREATE_FILESET_TIMEOUT = httpx.Timeout(10.0, connect=10.0) ++LIST_FILES_TIMEOUT = httpx.Timeout(10.0, connect=10.0) ++ ++# Timeout configurations for FilesetFileSystem operations. ++# These are passed via sdk.with_options(timeout=...) and control the httpx client. ++# httpx.Timeout(read=...) is the max wait for a single chunk (16MB by default), NOT total transfer time. ++# nemo-platform/src/nemo_platform/filesets/filesystem/filesystem.py > blocksize = 16 * 1024 * 1024 # 16MB ++# It's a socket-level timeout. Each individual socket read has its own timeout window. ++# SDK defaults httpx.Timeout(timeout=60, connect=5.0) nemo-platform/src/nemo_platform/_constants.py ++DOWNLOAD_TIMEOUT = httpx.Timeout(30.0, read=5 * 60) # 30s connect/pool, 5min per-chunk read ++UPLOAD_TIMEOUT = httpx.Timeout(30.0, write=10 * 60, read=5 * 60) # 30s connect/pool, 10min write, 5min read ++ ++# Retry configuration ++MAX_RETRIES = 3 ++INITIAL_BACKOFF_SECONDS = 1.0 ++MAX_BACKOFF_SECONDS = 30.0 ++ ++# Transient exceptions that should trigger retries for filesystem operations. ++# FilesetFileSystem uses httpx under the hood, so we retry on httpx transient errors ++# in addition to SDK-level transient errors. 
++TRANSIENT_FILESYSTEM_EXCEPTIONS = ( ++ httpx.TimeoutException, ++ httpx.ConnectError, ++ httpx.ReadTimeout, ++) ++ ++ ++class FileIORunner: ++ def __init__( ++ self, ++ sdk: NeMoPlatform, ++ progress_reporter: ProgressReporter, ++ job_ctx: NMPJobContext, ++ ): ++ self.sdk = sdk ++ self.progress_reporter = progress_reporter ++ self.job_ctx = job_ctx ++ ++ def list_fileset_files( ++ self, ++ fileset: FileSetRef, ++ ) -> list[FilesetFile]: ++ """List files in a FileSet. ++ ++ Returns a list of FilesetFile objects exposing 'path' and 'size' attributes. ++ """ ++ try: ++ with sdk_error_handler(FileDownloadError, f"list files in fileset {fileset}", passthrough=(NotFoundError,)): ++ response = self.sdk.with_options(timeout=LIST_FILES_TIMEOUT).files.list( ++ fileset=fileset.name, ++ workspace=fileset.workspace, ++ ) ++ logger.info(f"Found {len(response.data)} files in FileSet {fileset!s}") ++ return response.data ++ except NotFoundError as e: ++ raise FileDownloadError( ++ f"FileSet {fileset!s} not found. Please ensure the FileSet exists and contains the expected files.", ++ ) from e ++ ++ def download_fileset( ++ self, ++ fileset: FileSetRef, ++ dest_dir: Path, ++ ) -> DownloadStats: ++ """Download all files from a FileSet to a destination directory. ++ ++ Uses FilesetFileSystem.get() with recursive=True for efficient batch downloads. ++ Progress is tracked via two callbacks combined in a CompositeCallback: ++ - TqdmPerFileDownloadCallback: Creates a separate console progress bar per file (shows bytes) ++ - FileDownloadProgressCallback: Reports progress to Jobs service after each file ++ ++ Args: ++ fileset: The source FileSet reference. ++ dest_dir: The destination directory path. ++ ++ Returns: ++ DownloadStats with files_downloaded, total_bytes, and failed_files counts. ++ ++ Raises: ++ FileDownloadError: If the download fails. 
++ ++ """ ++ stats = DownloadStats() ++ fileset_name = str(fileset) ++ ++ # List files in the fileset to get total count and size ++ files = self.list_fileset_files(fileset) ++ ++ if not files: ++ logger.warning(f"FileSet {fileset_name} contains no files") ++ return stats ++ ++ total_files = len(files) ++ total_size = sum(f.size for f in files) ++ ++ # Ensure destination directory exists ++ dest_dir.mkdir(parents=True, exist_ok=True) ++ ++ # Build file sizes mapping for progress bar display ++ # Maps relative file paths to their sizes in bytes ++ file_sizes = {f.path.lstrip("/"): f.size for f in files} ++ ++ # Create callbacks: ++ # 1. TqdmPerFileDownloadCallback for console progress - creates a separate progress bar per file ++ tqdm_callback = TqdmPerFileDownloadCallback( ++ dest_path=dest_dir, ++ fileset_path=fileset_name, ++ file_sizes=file_sizes, ++ ) ++ ++ # 2. FileDownloadProgressCallback for Jobs service reporting ++ jobs_callback = FileDownloadProgressCallback( ++ progress_reporter=self.progress_reporter, ++ fileset_name=fileset_name, ++ total_files=total_files, ++ total_size=total_size, ++ stats=stats, ++ ) ++ ++ # Combine both callbacks into a composite that delegates to both ++ composite_callback = CompositeCallback(tqdm_callback, jobs_callback) ++ ++ with filesystem_sdk_error_handler( ++ FileDownloadError, ++ f"download from '{fileset_name}' to '{dest_dir}'", ++ ): ++ self._download_with_retry( ++ fileset_name=fileset.name, ++ fileset_workspace=fileset.workspace, ++ dest_dir=str(dest_dir), ++ callback=composite_callback, ++ ) ++ ++ logger.info(f"Download complete: {stats.files_downloaded} files, {stats.total_bytes} bytes") ++ return stats ++ ++ @retry( ++ stop=stop_after_attempt(MAX_RETRIES), ++ wait=wait_exponential(multiplier=2, min=INITIAL_BACKOFF_SECONDS, max=MAX_BACKOFF_SECONDS), ++ retry=retry_if_exception_type(TRANSIENT_FILESYSTEM_EXCEPTIONS), ++ reraise=True, ++ before_sleep=before_sleep_log(logger, logging.WARNING), ++ ) ++ def 
_download_with_retry( ++ self, ++ fileset_name: str, ++ fileset_workspace: str | None, ++ dest_dir: str, ++ callback: CompositeCallback, ++ ) -> None: ++ """Internal method with retry logic for downloading from FilesetFileSystem.""" ++ self.sdk.with_options(timeout=DOWNLOAD_TIMEOUT).files.download( ++ fileset=fileset_name, ++ workspace=fileset_workspace, ++ local_path=dest_dir, ++ callback=callback, # type: ignore[arg-type] ++ ) ++ ++ def upload_fileset( ++ self, ++ fileset: FileSetRef, ++ src_path: Path, ++ ) -> UploadStats: ++ """Upload all files from a source path (file or directory) to a FileSet. ++ ++ Uses FilesetFileSystem.put() with recursive=True for efficient batch uploads. ++ Progress is tracked via two callbacks combined in a CompositeCallback: ++ - TqdmPerFileUploadCallback: Creates a separate console progress bar per file (shows bytes) ++ - FileUploadProgressCallback: Reports progress to Jobs service after each file ++ ++ Args: ++ fileset: The target FileSet reference. ++ src_path: The source path, can be a single file or a directory. ++ progress_reporter: Progress reporter for status updates. ++ ++ Returns: ++ UploadStats with files_uploaded, total_bytes, and failed_files counts. ++ ++ Raises: ++ FileUploadError: If the upload fails. ++ ++ """ ++ stats = UploadStats() ++ fileset_name = str(fileset) ++ ++ # Create callbacks: ++ # 1. TqdmPerFileUploadCallback for console progress - creates a separate progress bar per file ++ tqdm_callback = TqdmPerFileUploadCallback(src_path=src_path) ++ ++ # 2. 
FileUploadProgressCallback for Jobs service reporting ++ jobs_callback = FileUploadProgressCallback( ++ progress_reporter=self.progress_reporter, ++ src_path=src_path, ++ fileset_name=fileset_name, ++ stats=stats, ++ ) ++ ++ # Combine both callbacks into a composite that delegates to both ++ composite_callback = CompositeCallback(tqdm_callback, jobs_callback) ++ ++ # Build local and remote paths for upload ++ # remote_path is relative within the fileset (e.g., "" for root, "filename" for single file) ++ if src_path.is_dir(): ++ # Add trailing slash to source to copy directory CONTENTS (not the directory itself) ++ # This follows rsync/scp convention: "dir/" copies contents, "dir" copies the directory ++ local_path = f"{src_path}/" ++ remote_path = "" # Upload to fileset root ++ else: ++ # Single file: upload to fileset root with same filename ++ local_path = str(src_path) ++ remote_path = src_path.name ++ ++ with filesystem_sdk_error_handler( ++ FileUploadError, ++ f"upload from '{src_path}' to '{fileset_name}'", ++ ): ++ self._upload_with_retry( ++ local_path=local_path, ++ remote_path=remote_path, ++ fileset_name=fileset.name, ++ fileset_workspace=fileset.workspace, ++ callback=composite_callback, ++ ) ++ ++ logger.info(f"Upload complete: {stats.files_uploaded} files, {stats.total_bytes} bytes") ++ return stats ++ ++ @retry( ++ stop=stop_after_attempt(MAX_RETRIES), ++ wait=wait_exponential(multiplier=2, min=INITIAL_BACKOFF_SECONDS, max=MAX_BACKOFF_SECONDS), ++ retry=retry_if_exception_type(TRANSIENT_FILESYSTEM_EXCEPTIONS), ++ reraise=True, ++ before_sleep=before_sleep_log(logger, logging.WARNING, exc_info=True), ++ ) ++ def _upload_with_retry( ++ self, ++ local_path: str, ++ remote_path: str, ++ fileset_name: str, ++ fileset_workspace: str | None, ++ callback: CompositeCallback, ++ ) -> None: ++ """Internal method with retry logic for uploading to FilesetFileSystem.""" ++ self.sdk.with_options(timeout=UPLOAD_TIMEOUT).files.upload( ++ local_path=local_path, ++ 
remote_path=remote_path, ++ fileset=fileset_name, ++ workspace=fileset_workspace, ++ callback=callback, # type: ignore[arg-type] ++ ) ++ ++ def run_download(self, downloads: list[DownloadItem]) -> None: ++ """Execute download operations. ++ ++ Downloads files from FileSets to job storage based on downloads list. ++ """ ++ if not downloads: ++ logger.info("No downloads configured, skipping download operation") ++ return ++ ++ storage_path = validate_storage_path(self.job_ctx.storage_path) ++ ++ logger.info(f"Starting download operation: {len(downloads)} fileset(s) to download") ++ ++ # Report task started ++ self.progress_reporter.update_progress( ++ status=PlatformJobStatus.ACTIVE, ++ status_details={ ++ "phase": TaskPhase.DOWNLOADING, ++ "total_filesets": len(downloads), ++ "completed_filesets": 0, ++ }, ++ ) ++ ++ total_stats = DownloadStats() ++ ++ for idx, item in enumerate(downloads): ++ fileset = item.src ++ # Validate destination path to prevent path traversal attacks ++ dest_dir = validate_safe_path(storage_path, item.dest) ++ ++ logger.info(f"[{idx + 1}/{len(downloads)}] Downloading from {fileset!s} to {dest_dir}") ++ ++ self.progress_reporter.update_progress( ++ status=PlatformJobStatus.ACTIVE, ++ status_details={ ++ "phase": TaskPhase.DOWNLOADING, ++ "total_filesets": len(downloads), ++ "completed_filesets": idx, ++ "current_fileset": f"{fileset!s}", ++ }, ++ ) ++ ++ stats = self.download_fileset( ++ fileset, ++ dest_dir, ++ ) ++ total_stats.files_downloaded += stats.files_downloaded ++ total_stats.total_bytes += stats.total_bytes ++ ++ logger.info(f"FileSet download complete: {stats.files_downloaded} files, {stats.total_bytes} bytes") ++ ++ logger.info( ++ f"All downloads complete: {total_stats.files_downloaded} files, {total_stats.total_bytes} bytes total", ++ ) ++ ++ def create_fileset(self, fileset: FileSetRef, metadata: dict | None = None) -> None: ++ """Create a FileSet. Skip if it already exists. 
++ ++ Uses retry logic for transient errors and converts SDK exceptions to FileUploadError. ++ """ ++ # sdk_error_handler wraps the retry to convert exceptions after all retries exhaust ++ with sdk_error_handler(FileUploadError, f"create fileset {fileset}", passthrough=(ConflictError,)): ++ self._create_fileset_with_retry(fileset, metadata) ++ ++ # we don't use sdk retry because it would retry on ConflictError which is expected and would be wasteful ++ @retry( ++ stop=stop_after_attempt(MAX_RETRIES), ++ wait=wait_exponential(multiplier=2, min=INITIAL_BACKOFF_SECONDS, max=MAX_BACKOFF_SECONDS), ++ retry=retry_if_exception_type((InternalServerError, APITimeoutError, APIConnectionError)), ++ reraise=True, # means that the last exception will be re-raised after the last retry attempt ++ ) ++ def _create_fileset_with_retry(self, fileset: FileSetRef, metadata: dict | None = None) -> None: ++ """Internal method with retry logic for creating a FileSet.""" ++ try: ++ create_kwargs: dict = { ++ "workspace": fileset.workspace, ++ "name": fileset.name, ++ "timeout": CREATE_FILESET_TIMEOUT, ++ "custom_fields": {"service_source": "automodel"}, ++ } ++ if metadata is not None: ++ create_kwargs["metadata"] = metadata ++ result = self.sdk.with_options(max_retries=0).files.filesets.create(**create_kwargs) ++ logger.info(f"Created FileSet: {result.workspace}/{result.name}") ++ except ConflictError: ++ # Fileset already exists - patch metadata so tool_calling etc. are not lost ++ workspace = fileset.workspace or self.job_ctx.workspace ++ if metadata is not None: ++ try: ++ self.sdk.with_options(max_retries=0).files.filesets.update( ++ name=fileset.name, ++ workspace=workspace, ++ metadata=metadata, ++ timeout=CREATE_FILESET_TIMEOUT, ++ ) ++ logger.info(f"Patched existing FileSet metadata: {workspace}/{fileset.name}") ++ except Exception as e: ++ logger.warning( ++ f"Could not patch metadata on existing fileset {workspace}/{fileset.name}: {e}. 
" ++ "Upload will continue; model-spec may lack tool_calling/chat_template from source." ++ ) ++ ++ def run_upload(self, uploads: list[UploadItem]) -> None: ++ """Execute upload operations. ++ ++ Uploads files from job storage to FileSets based on uploads list. ++ ++ Args: ++ uploads: List of upload items to process. ++ """ ++ if not uploads: ++ logger.info("No uploads configured, skipping upload operation") ++ return ++ ++ storage_path = validate_storage_path(self.job_ctx.storage_path) ++ ++ logger.info(f"Starting upload operation: {len(uploads)} fileset(s) to upload") ++ ++ # Report task started ++ self.progress_reporter.update_progress( ++ status=PlatformJobStatus.ACTIVE, ++ status_details={ ++ "phase": TaskPhase.UPLOADING, ++ "total_filesets": len(uploads), ++ "completed_filesets": 0, ++ }, ++ ) ++ ++ total_stats = UploadStats() ++ ++ for idx, item in enumerate(uploads): ++ if item.dest.workspace is None: ++ item.dest.workspace = self.job_ctx.workspace ++ fileset = item.dest ++ # Validate source path to prevent path traversal attacks ++ src_path = validate_safe_path(storage_path, item.src) ++ if not src_path.exists(): ++ raise FileUploadError(f"Source path does not exist: {src_path}. Ensure the source path exists.") ++ if not src_path.is_dir() and not src_path.is_file(): ++ raise FileUploadError( ++ f"Source path is not a file or directory: {src_path}. 
def run(sdk: NeMoPlatform | None = None, job_ctx: NMPJobContext | None = None) -> int:
    """Run the file I/O task: configured uploads first, then configured downloads.

    Args:
        sdk: SDK client to use (dependency injection for tests). When omitted, a
            client is obtained from ``get_task_sdk()`` and closed before returning.
        job_ctx: Job context to use (dependency injection for tests). When omitted,
            it is built with ``NMPJobContext.from_env()``.

    Returns:
        ``0`` when every operation succeeded, ``1`` when any exception was raised
        while the task was running.

    Notes:
        The storage path is validated before the ``try`` block, so a failure of
        that initial check propagates to the caller instead of becoming an exit code.
    """
    if not job_ctx:
        job_ctx = NMPJobContext.from_env()
    validate_storage_path(job_ctx.storage_path)

    sdk_owned = sdk is None
    progress_reporter: ProgressReporter | None = None

    def _report_failure(exc: Exception) -> int:
        # Forward the failure to the Jobs service when a reporter already exists,
        # then hand back the failing exit code.
        if progress_reporter:
            progress_reporter.update_progress(
                status=PlatformJobStatus.ERROR,
                error_details={"message": str(exc), "type": type(exc).__name__},
            )
        return 1

    try:
        if not sdk:
            sdk = get_task_sdk(SERVICE_NAME)
        # Per the original note, the reporter is a no-op if the Jobs URL is not configured.
        progress_reporter = JobsServiceProgressReporter.create_progress_reporter(sdk, job_ctx)
        runner = FileIORunner(sdk=sdk, progress_reporter=progress_reporter, job_ctx=job_ctx)

        config = get_config(job_ctx.config_path)

        logger.info(f"Starting file I/O task with job context: {job_ctx}")
        logger.info(f"Config: {config.model_dump_json(indent=2)}")
        logger.info(f"NeMo Platform service URL: {sdk.base_url}")

        # Uploads run before downloads; each step receives its own config section.
        runner.run_upload(config.upload)
        runner.run_download(config.download)

        progress_reporter.update_progress(
            status=PlatformJobStatus.COMPLETED,
            status_details={"phase": TaskPhase.COMPLETED, "message": "File I/O task completed successfully"},
        )
        return 0
    except PathTraversalError as e:
        # Logged without a traceback, unlike the other failure kinds.
        logger.error(f"Security error - path traversal detected: {e}")
        return _report_failure(e)
    except (FileDownloadError, FileUploadError) as e:
        logger.exception(f"File operation failed: {e}")
        return _report_failure(e)
    except Exception as e:
        logger.exception(f"File I/O task failed: {e}")
        return _report_failure(e)
    finally:
        # Only close a client this function created itself.
        if sdk_owned and sdk is not None:
            sdk.close()
@contextmanager
def sdk_error_handler(
    error_class: type[FileDownloadError | FileUploadError | ProgressReportError],
    operation: str,
    passthrough: tuple[type[BaseException], ...] = (),
) -> Iterator[None]:
    """Translate exceptions raised inside the ``with`` body into ``error_class``.

    Args:
        error_class: Exception type to raise in place of the original error.
        operation: Human-readable description of what was attempted; it is
            embedded in the message as "Failed to {operation} ...".
        passthrough: Exception types that are re-raised untouched so the caller
            can handle them outside the context manager.

    Raises:
        error_class: For every ``Exception`` not listed in ``passthrough``. The
            original exception is chained as ``__cause__``.
    """
    try:
        yield
    except Exception as e:
        if isinstance(e, passthrough):
            raise

        # The order of these checks is significant. Per the original note,
        # AuthenticationError and PermissionDeniedError are subclasses of
        # APIStatusError and therefore have to be tested before it. The timeout
        # check is likewise kept ahead of the connection check.
        if isinstance(e, APITimeoutError):
            message = f"Failed to {operation} due to request timeout error. Cause: {e.__cause__}. Error: {e}"
        elif isinstance(e, APIConnectionError):
            message = f"Failed to {operation} due to connection error. Cause: {e.__cause__}. Error: {e}"
        elif isinstance(e, AuthenticationError):
            message = f"Failed to {operation} due to authentication error. Error: {e}"
        elif isinstance(e, PermissionDeniedError):
            message = f"Failed to {operation} due to permission denied error. Error: {e}"
        elif isinstance(e, APIStatusError):
            message = f"Failed to {operation} due to API error. Status code: {e.status_code}. Error: {e}"
        else:
            message = f"Failed to {operation} due to unexpected error {type(e).__name__}: {e}"

        raise error_class(message) from e
The function resolves both paths to their ++ canonical absolute forms and verifies the result is under the base path. ++ ++ Args: ++ base_path: The base directory that the resolved path must stay within. ++ user_path: The user-provided relative path (e.g., from config). ++ ++ Returns: ++ The resolved absolute path that is guaranteed to be within base_path. ++ ++ Raises: ++ PathTraversalError: If the resolved path would escape base_path. ++ ++ Examples: ++ >>> base = Path("/var/storage") ++ >>> validate_safe_path(base, "subdir/file.txt") ++ PosixPath('/var/storage/subdir/file.txt') ++ ++ >>> validate_safe_path(base, "../../etc/passwd") ++ Raises PathTraversalError ++ ++ """ ++ # Resolve base_path to absolute canonical form ++ resolved_base = base_path.resolve() ++ ++ # Join and resolve the user path ++ # Using resolve() handles .., ., symlinks, etc. ++ resolved_path = (base_path / user_path).resolve() ++ ++ if not resolved_path.is_relative_to(resolved_base): ++ raise PathTraversalError( ++ f"Path '{user_path}' resolves outside of the base directory. " ++ "This may indicate a path traversal attack. " ++ "Ensure that paths such as ../.. are not used in the download destination path.", ++ ) ++ ++ return resolved_path +diff --git a/services/automodel/src/nmp/automodel/tasks/model_entity/__init__.py b/services/automodel/src/nmp/automodel/tasks/model_entity/__init__.py +new file mode 100644 +index 000000000..49784f0f5 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/model_entity/__init__.py +@@ -0,0 +1,8 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Model entity task for creating model entities after customization.""" ++ ++from nmp.automodel.tasks.model_entity.run import run ++ ++__all__ = ["run"] +diff --git a/services/automodel/src/nmp/automodel/tasks/model_entity/__main__.py b/services/automodel/src/nmp/automodel/tasks/model_entity/__main__.py +new file mode 100644 +index 000000000..90a4ffe62 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/model_entity/__main__.py +@@ -0,0 +1,15 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Entry point for model_entity task. ++ ++Usage: ++ python -m nmp.automodel.tasks.model_entity ++""" ++ ++import sys ++ ++from .run import run ++ ++if __name__ == "__main__": ++ sys.exit(run()) +diff --git a/services/automodel/src/nmp/automodel/tasks/model_entity/run.py b/services/automodel/src/nmp/automodel/tasks/model_entity/run.py +new file mode 100644 +index 000000000..9d8af54cd +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/model_entity/run.py +@@ -0,0 +1,436 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Model entity task entry point. ++ ++Handles creating model entities in the Models service after customization completes. ++ ++The task reads configuration and creates a Model Entity that references the ++uploaded model artifacts in the Files service. 
def get_config(config_path: Path) -> ModelEntityTaskConfig:
    """Load the task configuration from a JSON file and validate it.

    Args:
        config_path: Path to the JSON configuration file.

    Returns:
        The parsed content validated against ``ModelEntityTaskConfig``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        Exception: Whatever ``ModelEntityTaskConfig.model_validate`` raises for
            content that does not match the schema.
    """
    # Decode as UTF-8 explicitly. Without an encoding argument, open() uses the
    # process locale, so non-ASCII config values could be misread or rejected
    # depending on the container's locale settings.
    with open(config_path, encoding="utf-8") as f:
        data = json.load(f)
    return ModelEntityTaskConfig.model_validate(data)
@retry(
    stop=stop_after_attempt(MAX_RETRIES),
    wait=wait_exponential(multiplier=2, min=INITIAL_BACKOFF_SECONDS, max=MAX_BACKOFF_SECONDS),
    retry=retry_if_exception_type((InternalServerError, APITimeoutError, APIConnectionError)),
    reraise=True,
)
def create_model_entity(self, config: ModelEntityTaskConfig) -> tuple[dict, ModelEntity]:
    """Create the output model entity (or LoRA adapter) in the Models service.

    The fileset referenced by ``config`` is checked first, then the base model
    entity is looked up. For LoRA jobs an adapter is created on the base model;
    for every other job a new model entity is created. If the target already
    exists (``ConflictError``), it is updated in place instead.

    Args:
        config: Configuration for the model entity to create.

    Returns:
        Tuple of (result dict, deploy target). For LoRA the deploy target is the
        base model entity; otherwise it is the newly created or updated output
        model entity.

    Raises:
        ModelEntityCreationError: If the fileset is not accessible, or if
            creation/update fails for a non-transient reason.
        InternalServerError, APITimeoutError, APIConnectionError: Transient SDK
            errors are deliberately not wrapped so the ``@retry`` decorator can
            see and retry them. With ``reraise=True`` the last one propagates
            once all attempts are used up.
    """
    # Exception types the decorator retries on. Each handler below lets them
    # escape unwrapped; wrapping them in ModelEntityCreationError would make
    # tenacity give up after the first attempt.
    transient_errors = (InternalServerError, APITimeoutError, APIConnectionError)

    workspace = self.job_ctx.workspace
    logger.info(f"Creating model entity: {workspace}/{config.name}")

    # The fileset may live in another workspace; default to the job's workspace.
    fileset_workspace = config.fileset.workspace or workspace
    fileset_ref = f"{fileset_workspace}/{config.fileset.name}"

    logger.info(f"Validating fileset exists: {fileset_workspace}/{config.fileset.name}")
    try:
        self.sdk.files.filesets.retrieve(workspace=fileset_workspace, name=config.fileset.name)
        logger.info(f"Fileset validation successful: {fileset_workspace}/{config.fileset.name}")
    except transient_errors:
        # A flaky Files service says nothing about whether the fileset exists.
        raise
    except Exception as e:
        logger.error(f"Fileset validation failed: {fileset_workspace}/{config.fileset.name}")
        raise ModelEntityCreationError(
            f"Cannot create model entity: fileset '{fileset_workspace}/{config.fileset.name}' does not exist or is not accessible"
        ) from e

    base_me: ModelEntity = self.get_model_entity(config.model_entity, fileset_workspace)

    if config.peft is not None and config.peft.type == FinetuningType.LORA:
        # LoRA: register an adapter on the base model; the base model is what gets deployed.
        try:
            output_me = self.sdk.models.adapters.create(
                model_name=base_me.name,
                workspace=base_me.workspace,
                name=config.name,
                description=config.description,
                fileset=fileset_ref,
                finetuning_type=config.peft.type.value,
                lora_config=LoraParam(
                    alpha=config.peft.alpha,
                    rank=config.peft.rank,
                ),
                enabled=True,
            )
            return output_me.model_dump(), base_me
        except ConflictError:
            logger.warning(
                f"adapter {base_me.workspace}/{config.name} already exists for model {base_me.workspace}/{base_me.name}, updating with new fileset"
            )
            try:
                output_me = self.sdk.models.adapters.update(
                    adapter=config.name,
                    model_name=base_me.name,
                    workspace=base_me.workspace,
                    fileset=fileset_ref,
                    description=config.description,
                    enabled=True,
                )
                logger.info(
                    f"Successfully updated adapter: {base_me.workspace}/{config.name} for base model {base_me.workspace}/{base_me.name}"
                )
                return output_me.model_dump(), base_me
            except transient_errors:
                raise
            except Exception as update_error:
                logger.exception(
                    f"Failed to update existing adapter, {base_me.workspace}/{config.name}: {update_error}"
                )
                raise ModelEntityCreationError(
                    f"Adapter '{config.name}' already exists but update failed: {update_error}"
                ) from update_error
        except transient_errors:
            raise
        except Exception as e:
            logger.exception(f"Failed to create model adapter: {e}")
            raise ModelEntityCreationError(f"Failed to create model adapter: {e}") from e
    else:
        # Any non-LoRA job produces a standalone model entity that is itself the deploy target.
        ft_type = config.peft.type.value if config.peft else FinetuningType.ALL_WEIGHTS.value

        request_body = {
            "name": config.name,
            "description": config.description,
            "fileset": fileset_ref,
            "finetuning_type": ft_type,
            "trust_remote_code": base_me.trust_remote_code,
        }

        if config.base_model:
            request_body["base_model"] = config.base_model

        try:
            output_me = self.sdk.models.create(
                workspace=workspace,
                **request_body,
            )
            logger.info(f"Successfully created model entity: {output_me.workspace}/{output_me.name}")
            return output_me.model_dump(), output_me

        except ConflictError:
            logger.warning(f"Model entity already exists: {workspace}/{config.name}, updating existing model")
            try:
                # "name" is passed as its own keyword to update(), so drop it from the body.
                update_body = {k: v for k, v in request_body.items() if k != "name"}
                output_me = self.sdk.models.update(
                    name=config.name,
                    workspace=workspace,
                    **update_body,
                )
                logger.info(f"Successfully updated model entity: {output_me.workspace}/{output_me.name}")
                return output_me.model_dump(), output_me
            except transient_errors:
                raise
            except Exception as update_error:
                logger.exception(f"Failed to update existing model entity: {update_error}")
                raise ModelEntityCreationError(
                    f"Model entity '{config.name}' already exists and update failed: {update_error}"
                ) from update_error

        except transient_errors:
            raise
        except Exception as e:
            logger.exception(f"Failed to create model entity: {e}")
            raise ModelEntityCreationError(f"Failed to create model entity: {e}") from e
def _resolve_config_ref(self, config_ref: str, me_workspace: str) -> ModelDeploymentConfig:
    """Look up the ModelDeploymentConfig named by ``name`` or ``workspace/name``.

    Args:
        config_ref: Reference to resolve. A bare name is looked up in
            ``me_workspace``; ``workspace/name`` selects the workspace explicitly.
        me_workspace: Workspace used when ``config_ref`` has no workspace part.

    Returns:
        The deployment config returned by the inference service.

    Raises:
        ModelEntityCreationError: If the reference has more than one ``/`` or the
            retrieve call fails for any reason.
    """
    segments = config_ref.split("/")
    if len(segments) > 2:
        raise ModelEntityCreationError(
            f"Invalid deployment config reference '{config_ref}': expected 'name' or 'workspace/name'"
        )

    # str.split always yields at least one element: the last is the name, and a
    # leading element (if present) is the workspace.
    name = segments[-1]
    workspace = segments[0] if len(segments) == 2 else me_workspace

    try:
        return self.sdk.inference.deployment_configs.retrieve(workspace=workspace, name=name)
    except Exception as e:
        raise ModelEntityCreationError(
            f"Failed to resolve deployment config '{config_ref}' in workspace '{workspace}': {e}"
        ) from e
def run(sdk: NeMoPlatform | None = None, job_ctx: NMPJobContext | None = None) -> int:
    """Create the model entity for a finished job and launch its deployment.

    Args:
        sdk: SDK client to use (dependency injection for tests). When omitted, a
            client scoped to the job's workspace is obtained from
            ``get_task_sdk()`` and closed before returning.
        job_ctx: Job context to use (dependency injection for tests). When omitted,
            it is built with ``NMPJobContext.from_env()``.

    Returns:
        ``0`` on success, ``1`` if any exception was raised while running.
    """
    if not job_ctx:
        job_ctx = NMPJobContext.from_env()

    sdk_owned = sdk is None
    try:
        if not sdk:
            sdk = get_task_sdk(SERVICE_NAME).with_options(workspace=job_ctx.workspace)
        runner = ModelEntityRunner(sdk=sdk, job_ctx=job_ctx)

        config = get_config(job_ctx.config_path)

        logger.info(f"Starting model entity task with job context: {job_ctx}")
        logger.info(f"Config: {config.model_dump_json(indent=2)}")
        logger.info(f"NeMo Platform service URL: {sdk.base_url}")

        result, deploy_target = runner.create_model_entity(config)
        logger.info(f"Model entity creation complete: {result}")

        # launch_model receives the deploy target returned by create_model_entity.
        runner.launch_model(config, deploy_target)
        return 0
    except Exception as e:
        # Both failure kinds map to exit code 1; only the log message differs.
        if isinstance(e, ModelEntityCreationError):
            logger.exception(f"Model entity creation failed: {e}")
        else:
            logger.exception(f"Model entity task failed: {e}")
        return 1
    finally:
        # Only close a client this function created itself.
        if sdk_owned and sdk is not None:
            sdk.close()
++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Re-export file_io progress types for backward-compatible imports.""" ++ ++from nmp.automodel.tasks.file_io.progress_reporter import ( ++ JobsServiceProgressReporter, ++ NoOpProgressReporter, ++ ProgressReporter, ++) ++ ++__all__ = ["JobsServiceProgressReporter", "NoOpProgressReporter", "ProgressReporter"] +diff --git a/services/automodel/src/nmp/automodel/tasks/training/__init__.py b/services/automodel/src/nmp/automodel/tasks/training/__init__.py +new file mode 100644 +index 000000000..e69de29bb +diff --git a/services/automodel/src/nmp/automodel/tasks/training/__main__.py b/services/automodel/src/nmp/automodel/tasks/training/__main__.py +new file mode 100644 +index 000000000..f4397f999 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/training/__main__.py +@@ -0,0 +1,40 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++""" ++Training task entry point. ++ ++Usage: ++ python -m nmp.automodel.tasks.training ++ ++In distributed (multi-node) training, all pods run this entry point. 
def run() -> int:
    """Run the training task and map its outcome to a process exit code.

    Returns:
        ``0`` when the runner's result reports success; ``1`` when it does not or
        when any exception escapes the runner.
    """
    try:
        with TrainingRunner() as training_runner:
            outcome = training_runner.run()
            # Shell convention: 0 means success, 1 means failure.
            return 1 if not outcome.success else 0
    except Exception as e:
        logger.exception(f"Training task failed: {e}")
        return 1
class AutomodelBackend:
    """Compiles and runs nemo-automodel training for customization jobs.

    The class is a thin orchestration layer: configuration compilation and
    checkpoint handling are delegated to the sibling ``config`` and
    ``checkpoints`` modules, and training itself runs in a ``torchrun``
    subprocess.
    """

    def __init__(self, job_ctx: NMPJobContext):
        # The job context is forwarded verbatim to the config compiler.
        self.job_ctx = job_ctx

    @staticmethod
    def _model_type(customizer_config: TrainingStepConfig) -> ModelType:
        """Map the config's embedding flag onto the checkpoint ModelType."""
        return ModelType.EMBEDDING if customizer_config.model.is_embedding_model else ModelType.LLM

    def compile_config(
        self,
        config: TrainingStepConfig,
        workspace_dir: Path,
    ) -> dict[str, Any]:
        """
        Compile Automodel-specific configuration.

        The returned dict is not written to disk here; the runner handles that.
        Compilation itself is not free of I/O: the compiler prepares and
        validates the dataset under ``workspace_dir``.

        Args:
            config: Standardized training step configuration.
            workspace_dir: Job workspace passed through to the compiler.

        Returns:
            The Automodel recipe configuration as a plain dict.
        """
        return compile_automodel_config(config, workspace_dir, self.job_ctx)

    def execute_training(
        self,
        customizer_config: TrainingStepConfig,
        library_config: LibraryConfig,
        progress: JobsServiceProgressReporter,
    ) -> TrainingMetrics:
        """Run training in a ``torchrun`` subprocess and wait for it to finish.

        The config file has already been written to disk by the runner.
        Progress reporting happens within the training subprocess via
        TrainingProgressCallback, which reads job context from environment
        variables; ``progress`` is only used here to announce the phase.

        Args:
            customizer_config: Supplies ``training_timeout`` for the wait.
            library_config: Supplies ``config_path`` passed to the recipe.
            progress: Reporter used for the initial "training" status update.

        Returns:
            Empty TrainingMetrics (real metrics are reported by callbacks).

        Raises:
            subprocess.TimeoutExpired: Training exceeded ``training_timeout``.
            Exception: Whatever ``parse_error_from_output(...).to_exception()``
                builds when the subprocess exits with a non-zero code.
            SystemExit: From the signal handler on SIGINT/SIGTERM.
        """
        progress.report_running("training", backend="automodel")

        command = [
            "torchrun",
            *generate_torchrun_flags_from_env(),
            "-m",
            "nmp.automodel.tasks.training.backends.finetune",
            "--config",
            str(library_config.config_path),
        ]

        logger.info(f"Executing: {' '.join(command)}")

        training_process: subprocess.Popen | None = None

        # Rolling buffer to keep recent output lines for error extraction
        output_lines: deque[str] = deque(maxlen=MAX_OUTPUT_LINES)

        def cleanup(signum, frame):
            # Forward the signal to the child, give it 30s, then hard-kill.
            logger.warning(f"Signal {signum} received, terminating...")
            if training_process:
                training_process.send_signal(signum)
                try:
                    training_process.wait(timeout=30)
                except subprocess.TimeoutExpired:
                    training_process.kill()
            raise SystemExit(signum)

        # Installed before Popen so a signal arriving during startup is handled;
        # the handler tolerates training_process still being None.
        signal.signal(signal.SIGINT, cleanup)
        signal.signal(signal.SIGTERM, cleanup)

        start_time = time.time()

        training_process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,  # Line buffered
        )

        # Start reader thread to capture output without blocking
        reader_thread = threading.Thread(
            target=read_subprocess_output,
            args=(training_process, output_lines),
            daemon=True,
        )
        reader_thread.start()

        try:
            training_process.wait(timeout=customizer_config.training_timeout)
        except subprocess.TimeoutExpired:
            logger.exception("Training timed out")
            training_process.kill()
            # Reap the killed process to avoid zombies
            try:
                training_process.wait(timeout=30)
            except subprocess.TimeoutExpired:
                logger.warning(
                    "Killed training process did not terminate within 30s - "
                    "process may be stuck in uninterruptible state"
                )
            # Wait for reader thread to capture any remaining output before re-raising
            if reader_thread.is_alive():
                reader_thread.join(timeout=5)
            raise  # Let runner.py convert via create_error_details()

        # Wait for reader thread to finish capturing output
        if reader_thread.is_alive():
            reader_thread.join(timeout=5)

        duration = time.time() - start_time
        logger.info(f"Training finished in {duration:.1f} seconds")

        if training_process.returncode != 0:
            # The join above is bounded, so the reader may still be appending.
            # Parse a copy so the buffer cannot change underneath the parser.
            captured = output_lines.copy()
            parsed = parse_error_from_output(captured, training_process.returncode)
            raise parsed.to_exception()

        # Return empty metrics (actual metrics are reported via callbacks during training)
        # TODO: Consider parsing training logs or checkpoints to extract final metrics.
        return TrainingMetrics(total_steps=0, total_epochs=0)

    def find_best_checkpoint(
        self,
        workspace_dir: Path,
        customizer_config: TrainingStepConfig,
        library_config: LibraryConfig | None = None,
    ) -> Path:
        """Find best Automodel checkpoint.

        ``library_config`` is accepted for interface compatibility and unused.
        """
        return find_best_checkpoint(
            workspace_dir,
            customizer_config,
            model_type=self._model_type(customizer_config),
        )

    def process_checkpoint(
        self,
        checkpoint_path: Path,
        output_path: Path,
        customizer_config: TrainingStepConfig,
        library_config: LibraryConfig | None = None,
    ) -> CheckpointInfo:
        """Process Automodel checkpoint into the standard output layout."""
        model_type = self._model_type(customizer_config)

        # Extract resolved chat template from library config if available (LLM only)
        resolved_template = None
        if model_type == ModelType.LLM and library_config and library_config.config_dict:
            resolved_template = library_config.config_dict.get("_resolved_chat_template")

        return process_checkpoint(
            checkpoint_path,
            output_path,
            customizer_config,
            model_type=model_type,
            resolved_chat_template=resolved_template,
        )
000000000..04c7b40c2 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/training/backends/callbacks.py +@@ -0,0 +1,94 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++import logging ++ ++from nmp.automodel.tasks.training.progress import JobsServiceProgressReporter ++ ++logger = logging.getLogger(__name__) ++ ++ ++class TrainingProgressCallback: ++ """ ++ Callback for reporting Automodel training progress to the Jobs service. ++ ++ This class composes JobsServiceProgressReporter and provides training-specific ++ methods for reporting detailed metrics during training. ++ ++ Metric accumulation: train_loss and val_loss are accumulated as time-series ++ lists and included in every status_details update under a ``metrics`` key, ++ enabling loss-curve reconstruction from job status. ++ """ ++ ++ def __init__(self, reporter: JobsServiceProgressReporter): ++ self._reporter = reporter ++ ++ prior = reporter.fetch_current_metrics() ++ self._train_metrics: list[dict[str, float | int]] = prior.get("train_loss", []) ++ self._val_metrics: list[dict[str, float | int]] = prior.get("val_loss", []) ++ if self._train_metrics or self._val_metrics: ++ logger.info( ++ "Seeded metrics from server: %d train_loss, %d val_loss entries", ++ len(self._train_metrics), ++ len(self._val_metrics), ++ ) ++ ++ def _build_metrics_summary(self) -> dict[str, list[dict[str, float | int]]]: ++ """Build the accumulated metrics payload for inclusion in status_details.""" ++ return { ++ "train_loss": list(self._train_metrics), ++ "val_loss": list(self._val_metrics), ++ } ++ ++ def report_training_start(self, max_steps: int, num_epochs: int) -> None: ++ """Report that training has started with schedule information.""" ++ self._reporter.configure_progress_tracking(max_steps, num_epochs) ++ self._reporter.report_running(phase="training", step=0, max_steps=max_steps, num_epochs=num_epochs) ++ 
++ def report_train_step( ++ self, ++ step: int, ++ epoch: int, ++ loss: float, ++ lr: float | None = None, ++ grad_norm: float | None = None, ++ ) -> None: ++ """Report training step with metrics.""" ++ self._train_metrics.append({"step": step, "epoch": epoch, "value": loss}) ++ self._reporter.report_running( ++ phase="training", ++ step=step, ++ epoch=epoch, ++ train_loss=loss, ++ lr=lr, ++ grad_norm=grad_norm, ++ metrics=self._build_metrics_summary(), ++ ) ++ ++ def report_validation(self, step: int, epoch: int, val_loss: float) -> None: ++ """Report validation results.""" ++ self._val_metrics.append({"step": step, "epoch": epoch, "value": val_loss}) ++ self._reporter.report_running( ++ phase="validation", ++ step=step, ++ epoch=epoch, ++ val_loss=val_loss, ++ metrics=self._build_metrics_summary(), ++ ) ++ ++ def report_checkpoint_saved(self, step: int, epoch: int, checkpoint_path: str | None = None) -> None: ++ """Report that a checkpoint was saved.""" ++ self._reporter.report_running( ++ phase="checkpoint_saved", ++ step=step, ++ epoch=epoch, ++ checkpoint_path=checkpoint_path, ++ ) ++ ++ def report_epoch_end(self, step: int, epoch: int) -> None: ++ """Report that an epoch has completed.""" ++ self._reporter.report_running(phase="epoch_end", step=step, epoch=epoch) ++ ++ def close(self) -> None: ++ """Clean up resources.""" ++ self._reporter.close() +diff --git a/services/automodel/src/nmp/automodel/tasks/training/backends/checkpoints.py b/services/automodel/src/nmp/automodel/tasks/training/backends/checkpoints.py +new file mode 100644 +index 000000000..32d852b9c +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/training/backends/checkpoints.py +@@ -0,0 +1,522 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++""" ++Checkpoint processing for Automodel backend. 
++ ++This module handles: ++- Finding the best checkpoint after training ++- LoRA adapter merging ++- Chat template preservation ++- FSDP2 architecture fix ++- HF export and format conversion ++- ONNX export for embedding models ++ ++Supports both LLM and embedding (biencoder) models through unified functions. ++""" ++ ++import json ++import logging ++import re ++import shutil ++from enum import StrEnum ++from pathlib import Path ++ ++from nmp.automodel.tasks.training.chat_templates import ( ++ apply_chat_template_to_checkpoint, ++ resolve_chat_template, ++) ++from nmp.automodel.tasks.training.schemas import ( ++ CheckpointFormat, ++ CheckpointInfo, ++ FinetuningType, ++ Precision, ++ TrainingStepConfig, ++) ++ ++logger = logging.getLogger(__name__) ++ ++ ++class ModelType(StrEnum): ++ """Type of model for checkpoint processing.""" ++ ++ LLM = "llm" ++ EMBEDDING = "embedding" ++ ++ ++def extract_precision_from_model_config(model_path: str | Path) -> Precision | None: ++ """ ++ Extract precision from a HuggingFace model's config.json. ++ ++ HuggingFace models store their torch_dtype in config.json (e.g., "bfloat16"). ++ This function reads that value and maps it to our Precision enum. ++ ++ This is used to determine the actual training precision when "auto" was used ++ for torch_dtype. The precision comes from the base model's config, not from ++ the output checkpoint (which may only contain adapter weights for LoRA). 
++ ++ Args: ++ model_path: Path to the model directory containing config.json ++ ++ Returns: ++ Precision enum value if found, None otherwise ++ """ ++ config_path = Path(model_path) / "config.json" ++ if not config_path.exists(): ++ logger.warning(f"config.json not found at {config_path}, cannot extract precision") ++ return None ++ ++ try: ++ with open(config_path, "r") as f: ++ config = json.load(f) ++ ++ torch_dtype = config.get("torch_dtype") ++ if torch_dtype is None: ++ logger.warning("torch_dtype not found in config.json") ++ return None ++ ++ try: ++ precision = Precision.from_hf_dtype(torch_dtype) ++ logger.info(f"Extracted precision from model config: {torch_dtype} -> {precision.value}") ++ return precision ++ except ValueError: ++ logger.warning(f"Unknown torch_dtype '{torch_dtype}' in config.json, cannot map to Precision") ++ return None ++ ++ except (json.JSONDecodeError, IOError) as e: ++ logger.warning(f"Failed to read config.json: {e}") ++ return None ++ ++ ++def extract_step_number(path: Path) -> int: ++ """Extract step number from directory name like 'epoch_0_step_99'""" ++ match = re.search(r"step_(\d+)", path.name) ++ return int(match.group(1)) if match else -1 ++ ++ ++def get_model_dir_from_checkpoint(checkpoint_dir: Path, is_peft: bool) -> Path: ++ """ ++ Extract model directory from checkpoint directory. 
def get_model_dir_from_checkpoint(checkpoint_dir: Path, is_peft: bool) -> Path:
    """
    Extract model directory from checkpoint directory.

    Args:
        checkpoint_dir: A single checkpoint directory (e.g. an epoch/step dir).
        is_peft: True for LoRA checkpoints, which are looked up in ``model/``.
            Otherwise ``model/consolidated`` is preferred, then ``model/``.

    Returns:
        The resolved (absolute, symlink-free) model directory.

    Raises:
        FileNotFoundError: None of the candidate directories exists.
    """
    model_dir = checkpoint_dir / "model"

    if is_peft:
        # For LoRA, checkpoint is saved directly under model/ directory
        if model_dir.is_dir():
            logger.info(f"Found LoRA checkpoint at: {model_dir}")
            return model_dir.resolve()
    else:
        # For full-sft, check for consolidated directory first
        consolidated_dir = model_dir / "consolidated"
        if consolidated_dir.is_dir():
            logger.info(f"Found consolidated checkpoint at: {consolidated_dir}")
            return consolidated_dir.resolve()

        # Fallback to model/ directory if consolidated doesn't exist
        if model_dir.is_dir():
            logger.info(f"Found sharded checkpoint at: {model_dir}")
            return model_dir.resolve()

    raise FileNotFoundError(f"Model directory not found in checkpoint {checkpoint_dir}")


def find_best_checkpoint(
    workspace_dir: Path,
    config: TrainingStepConfig,
    model_type: ModelType = ModelType.LLM,
) -> Path:
    """
    Find the best checkpoint directory.

    Order of preference under ``workspace_dir / "checkpoints"``:
        1. LOWEST_VAL symlink
        2. LATEST symlink
        3. ``epoch_*_step_*`` directory with the highest step number

    Args:
        workspace_dir: Job workspace containing the ``checkpoints`` directory.
        config: Training config; ``training.finetuning_type`` decides whether
            the checkpoint is treated as a PEFT (LoRA) checkpoint.
        model_type: Only affects wording of log and error messages.

    Returns:
        The model directory inside the selected checkpoint.

    Raises:
        FileNotFoundError: No usable checkpoint directory was found.
    """
    base_dir = workspace_dir / "checkpoints"
    is_peft = config.training.finetuning_type in (FinetuningType.LORA, FinetuningType.LORA_MERGED)
    # Build the noun once instead of collapsing double spaces in the finished
    # message, which also rewrote checkpoint names/paths containing "  ".
    noun = "embedding checkpoint" if model_type == ModelType.EMBEDDING else "checkpoint"

    for link_name in ["LOWEST_VAL", "LATEST"]:
        link = base_dir / link_name
        if link.exists() and link.is_symlink():
            try:
                target = link.resolve()
                if target.exists():
                    logger.info(f"Using {link_name} {noun}: {target.name}")
                    return get_model_dir_from_checkpoint(target, is_peft)
            except Exception as e:
                # Best effort: a broken link falls through to the next candidate.
                logger.warning(f"Failed to resolve {link_name} symlink: {e}")

    # Fallback: scan directories (plain files matching the pattern are ignored)
    epoch_step_dirs = [candidate for candidate in base_dir.glob("epoch_*_step_*") if candidate.is_dir()]
    if not epoch_step_dirs:
        raise FileNotFoundError(f"No {noun} directories found in {base_dir}")

    best_checkpoint = max(epoch_step_dirs, key=extract_step_number)
    logger.info(f"Using latest {noun} by step number: {best_checkpoint.name}")
    return get_model_dir_from_checkpoint(best_checkpoint, is_peft)
def fix_fsdp2_architecture(model_path: Path) -> None:
    """
    Fix FSDP2 architecture naming issue in HuggingFace config.

    FSDP2 adds "FSDP" prefix to architecture names (e.g., "FSDPLlamaForCausalLM"
    instead of "LlamaForCausalLM"). This function removes that prefix to ensure
    the checkpoint is compatible with standard HuggingFace/vLLM loading.

    The file is rewritten only when at least one name actually changes. A
    missing config.json, a missing/non-list ``architectures`` entry, or
    non-string entries are left untouched.

    Reference: https://github.com/huggingface/transformers/commit/dc262ee6f57f2154f5233e53482da14dbe3be834
    """
    config_path = model_path / "config.json"
    if not config_path.exists():
        logger.warning(f"config.json not found at {config_path}, skipping FSDP2 fix")
        return

    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    original_archs = config.get("architectures") if isinstance(config, dict) else None
    if not isinstance(original_archs, list):
        # Covers a missing key as well as an explicit null.
        return

    fixed_archs = [arch.removeprefix("FSDP") if isinstance(arch, str) else arch for arch in original_archs]

    if original_archs != fixed_archs:
        config["architectures"] = fixed_archs
        with open(config_path, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)
        logger.info(f"Fixed FSDP2 architecture names: {original_archs} -> {fixed_archs}")


def merge_lora_adapter(
    adapter_path: Path,
    base_model_path: str,
    output_path: Path,
) -> None:
    """
    Merge LoRA adapter weights into the base model.

    Uses HuggingFace's PEFT library to:
    1. Load the base model
    2. Attach the LoRA adapter
    3. Merge weights using merge_and_unload()
    4. Save as a standard HuggingFace checkpoint

    Note: This function only supports LLM models. For embedding models,
    use merge_lora_embedding_adapter() instead.

    Args:
        adapter_path: Path to the LoRA adapter checkpoint
        base_model_path: Path to the base model (for loading weights)
        output_path: Where to save the merged model

    Raises:
        ImportError: ``torch``, ``peft`` or ``transformers`` is not installed.
    """
    import gc
    import tempfile

    try:
        import torch
        from peft import PeftModel
        from transformers import AutoModelForCausalLM, AutoTokenizer
    except ImportError as e:
        raise ImportError(
            "LoRA merge requires 'peft' and 'transformers' packages. Ensure they are installed in the container."
        ) from e

    logger.info(f"Merging LoRA adapter from {adapter_path} with base model {base_model_path}")

    # Use scratch directory if available for better I/O performance. The staging
    # directory is unique per call so concurrent or back-to-back merges (LLM and
    # embedding) cannot delete or overwrite each other's files.
    staging_root = "/scratch" if Path("/scratch").is_dir() else None
    tmp_path = Path(tempfile.mkdtemp(prefix="merged_lora_", dir=staging_root))

    model = None
    try:
        # 1. Load base model in mergeable dtype (not quantized)
        logger.info("Loading base model...")
        model = AutoModelForCausalLM.from_pretrained(
            base_model_path,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )

        # 2. Attach the LoRA adapter
        logger.info("Loading LoRA adapter...")
        model = PeftModel.from_pretrained(model, str(adapter_path))

        # 3. Merge LoRA weights into base model
        logger.info("Merging LoRA weights...")
        model = model.merge_and_unload()

        # 4. Save merged model
        logger.info(f"Saving merged model to {tmp_path}...")
        model.save_pretrained(tmp_path, safe_serialization=True)

        # 5. Save tokenizer from base model
        tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
        tokenizer.save_pretrained(tmp_path)

        # 6. Copy to output path
        output_path.mkdir(parents=True, exist_ok=True)
        shutil.copytree(tmp_path, output_path, dirs_exist_ok=True)

        logger.info(f"Successfully merged LoRA adapter to {output_path}")

    finally:
        # Cleanup temp directory
        shutil.rmtree(tmp_path, ignore_errors=True)
        # Release the merged model like the embedding variant does: drop the
        # reference, collect, then hand cached blocks back to the allocator.
        model = None
        gc.collect()
        torch.cuda.empty_cache()
def merge_lora_embedding_adapter(
    adapter_path: Path,
    base_model_path: str,
    output_path: Path,
) -> None:
    """Merge a LoRA adapter into a base embedding model.

    This intentionally mirrors the logic in Automodel's `tools/merge_lora.py`,
    but is implemented locally because the customizer container may not have
    that module on `PYTHONPATH`.

    Tokenizer export is best effort: a failure is logged and the merge still
    succeeds.

    Args:
        adapter_path: Path to the PEFT adapter directory.
        base_model_path: HuggingFace model name or path for the base encoder.
        output_path: Where to write the merged model.

    Raises:
        ImportError: ``torch``, ``peft`` or ``transformers`` is not installed.
    """
    import gc
    import tempfile

    try:
        import torch
        from peft import PeftModel
        from transformers import AutoModel, AutoTokenizer
    except ImportError as e:
        raise ImportError(
            "LoRA merge requires 'peft' and 'transformers' packages. Ensure they are installed in the container."
        ) from e

    logger.info("Merging embedding LoRA adapter from %s with base model %s", adapter_path, base_model_path)

    # Use scratch directory if available for better I/O performance. The staging
    # directory is unique per call so it can never collide with another merge.
    staging_root = "/scratch" if Path("/scratch").is_dir() else None
    tmp_path = Path(tempfile.mkdtemp(prefix="merged_lora_", dir=staging_root))

    model = None
    try:
        logger.info("Loading base model (AutoModel): %s", base_model_path)
        model = AutoModel.from_pretrained(
            base_model_path,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )

        logger.info("Loading adapter from %s", adapter_path)
        model = PeftModel.from_pretrained(model, str(adapter_path))

        logger.info("Merging adapter into base model")
        model = model.merge_and_unload()

        logger.info("Saving merged model to %s", tmp_path)
        model.save_pretrained(tmp_path, safe_serialization=True)

        try:
            tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
            tokenizer.save_pretrained(tmp_path)
            logger.info("Tokenizer saved to %s", tmp_path)
        except Exception as e:
            logger.warning("Could not save tokenizer: %s", e)

        output_path.mkdir(parents=True, exist_ok=True)
        shutil.copytree(tmp_path, output_path, dirs_exist_ok=True)
        logger.info("Successfully merged embedding LoRA adapter to %s", output_path)

    finally:
        shutil.rmtree(tmp_path, ignore_errors=True)
        # Drop the reference and collect BEFORE emptying the CUDA cache, so the
        # memory freed by collection is actually returned by empty_cache().
        model = None
        gc.collect()
        torch.cuda.empty_cache()


def export_onnx(
    model_path: Path,
    output_path: Path,
    tokenizer_path: str,
) -> Path:
    """Export an embedding model to ONNX format.

    Uses Automodel's export_to_onnx to export to ONNX format.
    The resulting `model.onnx` is written into *output_path* alongside
    the existing HuggingFace checkpoint files.

    Args:
        model_path: Path to the HuggingFace model directory (config.json + weights).
        output_path: Directory where ``model.onnx`` will be written.
        tokenizer_path: Fallback tokenizer location (base model path). Used when
            the checkpoint directory does not contain tokenizer files.

    Returns:
        Path to the exported ``model.onnx`` file.

    Raises:
        Exception: Any export failure is logged with traceback and re-raised.
    """
    # need to import here for the tests
    from nemo_automodel.components.models.biencoder.export_onnx import export_to_onnx

    logger.info(f"Exporting embedding model at path {model_path} to ONNX format at path {output_path}")

    # Fixed export settings: average pooling, normalized output, opset 17,
    # fp16 weights, with the exporter's own verification enabled.
    export_kwargs = {
        "model_path": str(model_path),
        "output_dir": str(output_path),
        "tokenizer_path": tokenizer_path,
        "pooling": "avg",
        "normalize": True,
        "opset": 17,
        "export_dtype": "fp16",
        "verify": True,
    }
    try:
        onnx_path = export_to_onnx(**export_kwargs)
    except Exception:
        logger.exception(f"ONNX export failed for model at {model_path}")
        raise

    logger.info(f"ONNX model exported to {onnx_path}")
    return Path(onnx_path)


# Top-level entry names that stay at the root of an embedding output directory;
# everything else is moved under alternates/hf/.
_ONNX_TOP_LEVEL_PATTERNS = {"model.onnx", "model.onnx.data", "tokenizer"}
def _restructure_embedding_output(output_path: Path) -> None:
    """Move HF artifacts into ``alternates/hf/`` so the NIM selects the ONNX profile.

    NIM scans the top-level directory to choose the model backend. If it sees
    ``.safetensors`` files it creates a PyTorch profile, which is unsupported
    for custom models in many NIM versions. The legacy customizer kept only
    ``model.onnx`` (+ tokenizer/) at the root and placed HF weights under
    ``alternates/hf/``. This function reproduces that layout.

    Entries whose name is in ``_ONNX_TOP_LEVEL_PATTERNS`` and the
    ``alternates`` directory itself stay in place; everything else is moved.
    """
    alternates_hf = output_path / "alternates" / "hf"
    alternates_hf.mkdir(parents=True, exist_ok=True)

    keep_at_root = _ONNX_TOP_LEVEL_PATTERNS | {"alternates"}
    # Materialize the listing first: the directory is modified while looping.
    for entry in list(output_path.iterdir()):
        if entry.name in keep_at_root:
            continue
        dest = alternates_hf / entry.name
        logger.info("Moving %s -> %s", entry, dest)
        shutil.move(str(entry), str(dest))

    logger.info("Restructured embedding output: ONNX at top level, HF in alternates/hf/")


def process_checkpoint(
    checkpoint_path: Path,
    output_path: Path,
    customizer_config: TrainingStepConfig,
    model_type: ModelType = ModelType.LLM,
    resolved_chat_template: str | None = None,
) -> CheckpointInfo:
    """
    Process checkpoint to standard output format.

    Works for both LLM and embedding (biencoder) models.

    Handles three scenarios:
    1. Full weights training: Copy checkpoint, fix FSDP2 arch, preserve chat template (LLM only)
    2. LoRA (unmerged): Copy adapter, preserve format as hf-peft
    3. LoRA merged: Merge adapter with base model, output as standard HF

    For embedding models the result is additionally exported to ONNX and the
    HF files are moved under ``alternates/hf/``.

    Args:
        checkpoint_path: Path to the checkpoint directory (model files)
        output_path: Where to write the processed checkpoint
        customizer_config: Training configuration with model paths and settings
        model_type: Type of model ("llm" or "embedding")
        resolved_chat_template: Pre-resolved chat template from training config (LLM only).
            If provided, this template is used. Otherwise, falls back to
            priority-based resolution using model.name and model.path.

    Returns:
        CheckpointInfo with output path, format, and precision
    """
    output_path.mkdir(parents=True, exist_ok=True)

    finetuning_type = customizer_config.training.finetuning_type
    base_model_path = customizer_config.model.path
    is_embedding = model_type == ModelType.EMBEDDING
    # Prefix (with its own trailing space) instead of collapsing double spaces
    # in the finished message, which also rewrote paths containing "  ".
    label_prefix = "embedding " if is_embedding else ""

    # Resolve chat template using the same priority logic as training:
    # 1. Use pre-resolved template if provided (ensures consistency with training)
    # 2. Otherwise, resolve using priority-based selection
    chat_template: str | None = None
    if not is_embedding:
        if resolved_chat_template is not None:
            chat_template = resolved_chat_template
            logger.info("Using pre-resolved chat template from training config")
        else:
            # Fall back to priority-based resolution (user_template from fileset metadata takes priority)
            chat_template = resolve_chat_template(
                model_path=base_model_path,
                model_name=customizer_config.model.name,
                user_template=customizer_config.model.chat_template,
            )

    if finetuning_type == FinetuningType.LORA_MERGED:
        # LoRA merged: merge adapter weights into base model
        # For embedding models, this produces a full-weight model compatible with ONNX export and NIM serving.
        merge = merge_lora_embedding_adapter if is_embedding else merge_lora_adapter
        merge(
            adapter_path=checkpoint_path,
            base_model_path=base_model_path,
            output_path=output_path,
        )
        checkpoint_format = CheckpointFormat.HF
        is_full_weight_output = True

    elif finetuning_type == FinetuningType.LORA:
        # LoRA unmerged: just copy the adapter files
        logger.info(f"Copying {label_prefix}LoRA adapter from {checkpoint_path} to {output_path}")
        shutil.copytree(checkpoint_path, output_path, dirs_exist_ok=True)
        checkpoint_format = CheckpointFormat.HF_PEFT
        # Note: For hf-peft, chat template is inherited from base model at inference time
        is_full_weight_output = False

    else:
        # Full weights training: copy and process
        logger.info(f"Copying {label_prefix}full weights checkpoint from {checkpoint_path} to {output_path}")
        shutil.copytree(checkpoint_path, output_path, dirs_exist_ok=True)
        checkpoint_format = CheckpointFormat.HF
        is_full_weight_output = True

    if is_full_weight_output:
        # Fix FSDP2 architecture naming
        fix_fsdp2_architecture(output_path)
        # Apply chat template for LLM models only (always None for embeddings)
        if chat_template:
            apply_chat_template_to_checkpoint(output_path, chat_template)

    if is_embedding:
        # NOTE(review): this also runs for unmerged LoRA, where output_path
        # holds only adapter files - confirm that combination is rejected upstream.
        export_onnx(
            model_path=output_path,
            output_path=output_path,
            tokenizer_path=base_model_path,
        )
        _restructure_embedding_output(output_path)

    # Determine precision: use explicit config value, or extract from base model
    precision = customizer_config.model.precision
    if precision is None:
        precision = extract_precision_from_model_config(base_model_path)

    return CheckpointInfo(
        path=str(output_path),
        format=checkpoint_format,
        precision=precision,
    )
++""" ++ ++import logging ++import os ++from pathlib import Path ++from typing import Any ++ ++from nemo_automodel._transformers.registry import ModelRegistry ++from nmp.automodel.app.jobs.context import NMPJobContext ++from nmp.automodel.tasks.training.chat_templates import resolve_chat_template ++from nmp.automodel.tasks.training.datasets.preparation import ( ++ DatasetSchema, ++ PreparedDataset, ++ compute_val_check_interval, ++ detect_dataset_schema, ++ prepare_dataset, ++) ++from nmp.automodel.tasks.training.datasets.validation import DatasetValidator ++from nmp.automodel.tasks.training.integrations import ( ++ build_mlflow_config, ++ build_wandb_config, ++) ++from nmp.automodel.tasks.training.schemas import ( ++ EmbeddingConfig, ++ FinetuningType, ++ LoRAConfig, ++ TrainingStepConfig, ++ TrainingType, ++) ++from nmp.automodel.tasks.training.sequence_packing import ( ++ calculate_optimal_pack_size, ++ estimate_dataset_sequence_lengths, ++) ++ ++logger = logging.getLogger(__name__) ++ ++ ++def compile_automodel_config( ++ customizer_config: TrainingStepConfig, ++ workspace_dir: Path, ++ job_ctx: NMPJobContext, ++) -> dict[str, Any]: ++ """ ++ Compile Automodel-specific configuration. ++ ++ This transforms the standardized TrainingStepConfig into the format ++ expected by nemo_automodel's TrainFinetuneRecipeForNextTokenPrediction. 
++ """ ++ cfg: dict[str, Any] = {} ++ _is_embedding_model = customizer_config.model.is_embedding_model ++ trust_remote_code = customizer_config.model.trust_remote_code ++ embedding_config = EmbeddingConfig() ++ ++ # === Distributed Environment === ++ # Required for torch.distributed initialization ++ cfg["dist_env"] = { ++ "backend": "nccl", ++ "timeout_minutes": 30, # Higher timeout for large model loading ++ } ++ ++ # === Random Number Generator === ++ # Both recipes use StatefulRNG for reproducibility across restarts and multi-node training, ++ # but they expect the config in different formats: ++ # - Biencoder recipe: expects cfg["seed"] and creates StatefulRNG internally ++ # - LLM recipe: expects cfg["rng"] with full StatefulRNG config ++ seed = int(os.environ.get("PL_GLOBAL_SEED", customizer_config.seed)) ++ ++ if _is_embedding_model: ++ # Biencoder recipe creates StatefulRNG from seed value internally ++ # See: nemo_automodel/recipes/biencoder/train_biencoder.py ++ cfg["seed"] = seed ++ else: ++ # LLM recipe expects the full rng config object ++ cfg["rng"] = { ++ "_target_": "nemo_automodel.components.training.rng.StatefulRNG", ++ "seed": seed, ++ "ranked": True, # Different seed per rank for data augmentation ++ } ++ ++ # === Model Configuration === ++ # Common fields shared by both embedding and causal LM models ++ cfg["model"] = { ++ "pretrained_model_name_or_path": customizer_config.model.path, ++ "torch_dtype": customizer_config.model.precision.to_torch_dtype() ++ if customizer_config.model.precision ++ else "auto", ++ # trust_remote_code is required for models like nvidia/llama-nemotron-embed-1b-v2 ++ # which use custom model_type "llama_bidirec" with custom modeling code. 
++ "trust_remote_code": trust_remote_code, ++ } ++ if customizer_config.model.override_custom_impl: ++ cfg["model"]["force_hf"] = True ++ ++ if _is_embedding_model: ++ cfg["model"].update( ++ { ++ "_target_": "nemo_automodel.components.models.biencoder.NeMoAutoModelBiencoder.from_pretrained", ++ # Use the same encoder for both queries and passages. default value taken from Automodel example ++ "share_encoder": True, ++ # Add a trainable linear layer after pooling to reduce embedding dimension. default value taken from Automodel example ++ "add_linear_pooler": False, ++ # How to combine token embeddings into a single document/query embedding. default value taken from Automodel example ++ "pooling": "avg", ++ # Normalize embeddings to unit length (length = 1). default value taken from Automodel example ++ "l2_normalize": True, ++ # When training an embedding model, we want it to learn that similar things should have similar embeddings ++ # and different things should have different embeddings. ++ # Temperature controls how "strict" the model is when learning these relationships. ++ # Low value (0.02), tells the model to pick the correct doc and penalizes near-misses. ++ # High value (like 1.0 that's Automodel default) tells the model to be more lenient and allows for near-misses. ++ # 0.02 is taken from the Automodel example for biencoder training. ++ "t": 0.02, ++ # Total number of passages per query during training: 1 positive + (n-1) negatives. ++ # For example, train_n_passages=5 means 1 positive and 4 negative passages per query. ++ # This differs from legacy Customizer's 'num_hard_negatives' which only counted negatives ++ # (num_hard_negatives=4 is equivalent to train_n_passages=5). ++ "train_n_passages": embedding_config.train_n_passages, ++ # Number of negative passages per query during validation. ++ "eval_negative_size": get_eval_negative_size(embedding_config), ++ # Gradient checkpointing saves memory by not storing all activations during forward pass. 
++ # Instead, it recomputes them during backward pass with a memory trade-off - less memory, slower training. ++ # Useful for large models or limited GPU memory. ++ # TODO: consider exposing this in CustomizationJobInput ++ "do_gradient_checkpointing": embedding_config.do_gradient_checkpointing, ++ "use_liger_kernel": True, ++ "use_sdpa_patching": True, ++ } ++ ) ++ ++ # === Tokenizer === ++ cfg["tokenizer"] = { ++ "_target_": "nemo_automodel._transformers.auto_tokenizer.NeMoAutoTokenizer.from_pretrained", ++ "pretrained_model_name_or_path": customizer_config.model.path, ++ } ++ else: ++ cfg["model"].update( ++ { ++ "_target_": "nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained", ++ "attn_implementation": customizer_config.model.attn_implementation, ++ } ++ ) ++ ++ # === Distributed Configuration === ++ p = customizer_config.parallelism ++ total_gpus = p.num_nodes * p.num_gpus_per_node ++ # Note dp_size is typically auto-derived by Automodel (world_size / (tp * pp * cp)), ++ # but we calculate it explicitly here because: ++ # 1. It's validated upstream in validators.py ++ # 2. We need it for warmup_steps validation below ++ # 3. 
Passing an explicit value ensures consistency rather than relying on Automodel's derivation ++ dp = total_gpus // (p.tensor_parallel_size * p.pipeline_parallel_size * p.context_parallel_size) ++ ++ cfg["distributed"] = { ++ "_target_": "nemo_automodel.components.distributed.fsdp2.FSDP2Manager", ++ "dp_size": dp, ++ "tp_size": p.tensor_parallel_size, ++ "pp_size": p.pipeline_parallel_size, ++ "cp_size": p.context_parallel_size, ++ "ep_size": p.expert_parallel_size, ++ "sequence_parallel": p.sequence_parallel, ++ } ++ if p.pipeline_parallel_size > 1: ++ cfg["distributed"]["pipeline"] = { ++ "pp_schedule": "interleaved1f1b", ++ "pp_microbatch_size": 1, ++ "scale_grads_in_schedule": False, ++ } ++ ++ # === Dataset Preparation === ++ # Discover, merge, and optionally split dataset files ++ prepared = prepare_dataset( ++ dataset_path=Path(customizer_config.dataset.path), ++ output_dir=workspace_dir / "dataset", ++ seed=customizer_config.seed, ++ ) ++ logger.info( ++ f"Prepared dataset: train={prepared.train_samples} samples, validation={prepared.validation_samples} samples, files: " ++ f"train={prepared.train_file.absolute()}, validation={prepared.validation_file.absolute()}" ++ ) ++ validator = DatasetValidator(training_type=customizer_config.training.training_type) ++ validator.validate_dataset(str(prepared.train_file)) ++ validator.validate_dataset(str(prepared.validation_file)) ++ logger.info("Validated datasets successfully") ++ ++ # === Step Scheduler (with val_check_interval conversion) === ++ batch_size = customizer_config.batch.global_batch_size ++ epochs = customizer_config.schedule.epochs ++ ++ # Compute steps per epoch (round up to ensure all samples are used) ++ steps_per_epoch = (prepared.train_samples + batch_size - 1) // batch_size ++ total_steps = steps_per_epoch * epochs ++ ++ # Determine effective max_steps ++ user_max_steps = customizer_config.schedule.max_steps ++ if user_max_steps and user_max_steps > 0: ++ max_steps = min(user_max_steps, 
total_steps) ++ else: ++ max_steps = total_steps ++ ++ logger.info( ++ f"Training schedule: {prepared.train_samples} samples, batch_size={batch_size}, " ++ f"steps_per_epoch={steps_per_epoch}, epochs={epochs}, max_steps={max_steps}" ++ ) ++ ++ cfg["step_scheduler"] = { ++ "global_batch_size": batch_size, ++ "local_batch_size": customizer_config.batch.micro_batch_size, ++ "max_steps": max_steps, ++ "num_epochs": epochs, ++ } ++ ++ val_every_steps = compute_val_check_interval( ++ steps_per_epoch=steps_per_epoch, ++ max_steps=max_steps, ++ val_check_interval=customizer_config.schedule.val_check_interval, ++ ) ++ cfg["step_scheduler"]["val_every_steps"] = val_every_steps ++ cfg["step_scheduler"]["ckpt_every_steps"] = val_every_steps ++ logger.info(f"Validation interval: {customizer_config.schedule.val_check_interval} -> {val_every_steps} steps") ++ ++ # === Validate warmup_steps === ++ # Automodel requires: lr_warmup_steps < lr_decay_steps (scheduler.py line 96) ++ # lr_decay_steps = total_optimizer_steps (accounting for gradient accumulation) ++ warmup_steps = customizer_config.optimizer.warmup_steps ++ if warmup_steps > 0: ++ micro_batch_size = customizer_config.batch.micro_batch_size ++ ++ # Calculate gradient accumulation steps (how StepScheduler computes it) ++ grad_acc_steps = batch_size // (micro_batch_size * dp) ++ ++ # Calculate total optimizer steps (accounting for gradient accumulation) ++ total_optimizer_steps = (epochs * prepared.train_samples) // grad_acc_steps ++ ++ # lr_decay_steps will be min(max_steps, total_optimizer_steps) ++ lr_decay_steps = min(total_optimizer_steps, max_steps) ++ ++ if warmup_steps >= lr_decay_steps: ++ raise ValueError( ++ f"warmup_steps ({warmup_steps}) must be less than lr_decay_steps ({lr_decay_steps}). 
" ++ f"Calculation: grad_acc_steps={grad_acc_steps} (batch_size={batch_size} / " ++ f"(micro_batch_size={micro_batch_size} * dp_size={dp})), " ++ f"total_optimizer_steps={total_optimizer_steps} (epochs={epochs} * " ++ f"steps_per_epoch={prepared.train_samples} / grad_acc_steps={grad_acc_steps}), " ++ f"lr_decay_steps=min({total_optimizer_steps}, {max_steps})={lr_decay_steps}" ++ ) ++ ++ # === Optimizer === ++ cfg["optimizer"] = { ++ "_target_": "torch.optim.Adam", ++ "lr": customizer_config.optimizer.learning_rate, ++ "weight_decay": customizer_config.optimizer.weight_decay, ++ "betas": [customizer_config.optimizer.beta1, customizer_config.optimizer.beta2], ++ "eps": customizer_config.optimizer.eps, # Adam epsilon for numerical stability ++ } ++ ++ cfg["lr_scheduler"] = { ++ "lr_decay_style": "cosine", ++ "lr_warmup_steps": customizer_config.optimizer.warmup_steps, ++ } ++ if customizer_config.optimizer.min_learning_rate: ++ cfg["lr_scheduler"]["min_lr"] = customizer_config.optimizer.min_learning_rate ++ ++ # === Checkpoint === ++ cfg["checkpoint"] = { ++ "enabled": True, ++ "model_save_format": "safetensors", ++ "checkpoint_dir": str(workspace_dir / "checkpoints"), ++ "save_consolidated": True, ++ # Required for models with quantized base weights (e.g., GPT-OSS) ++ # Safe to enable even for non-quantized models ++ "dequantize_base_checkpoint": True, ++ "v4_compatible": customizer_config.model.v4_compatible, ++ } ++ ++ # === Sequence Packing (must be computed before dataset config) === ++ # When packing is enabled, we use the pack size as the effective sequence length ++ # for dataset configuration. This ensures samples are truncated appropriately. 
++ effective_seq_length = customizer_config.model.max_seq_length ++ if not _is_embedding_model: ++ if customizer_config.batch.sequence_packing: ++ # Calculate optimal pack size based on dataset statistics ++ packing_estimate = estimate_dataset_sequence_lengths( ++ customizer_config, ++ train_file=prepared.train_file, ++ max_samples=customizer_config.batch.sequence_packing_max_samples, ++ seed=customizer_config.seed, ++ trust_remote_code=trust_remote_code, ++ ) ++ ++ if packing_estimate is not None: ++ optimal_pack_size = packing_estimate.pack_size ++ logger.info( ++ f"Sequence packing enabled: pack_size={optimal_pack_size}, " ++ f"avg_seq={packing_estimate.avg_seq_length}, max_seq={packing_estimate.max_seq_length}, " ++ f"packing_factor={packing_estimate.packing_factor}, samples={packing_estimate.samples_analyzed}" ++ ) ++ else: ++ # Fallback to conservative default (model max_seq_length) ++ optimal_pack_size = calculate_optimal_pack_size(customizer_config) ++ logger.info(f"Sequence packing enabled with conservative pack_size={optimal_pack_size}") ++ ++ cfg["packed_sequence"] = { ++ "packed_sequence_size": optimal_pack_size, ++ "split_across_pack": False, ++ } ++ ++ # Use pack size as the effective sequence length for datasets ++ effective_seq_length = optimal_pack_size ++ ++ # === Dataset Configuration (with schema detection) === ++ _configure_datasets( ++ cfg, ++ customizer_config, ++ prepared, ++ effective_seq_length, ++ seed, ++ _is_embedding_model, ++ embedding_config, ++ ) ++ ++ # === Dataloader === ++ # Embedding datasets configure their own specialized dataloaders in _configure_embedding_dataset ++ if not _is_embedding_model: ++ cfg["dataloader"] = { ++ "_target_": "torchdata.stateful_dataloader.StatefulDataLoader", ++ "collate_fn": "nemo_automodel.components.datasets.utils.default_collater", ++ "shuffle": True, ++ } ++ cfg["validation_dataloader"] = { ++ "_target_": "torchdata.stateful_dataloader.StatefulDataLoader", ++ "collate_fn": 
"nemo_automodel.components.datasets.utils.default_collater", ++ } ++ ++ # === PEFT (LoRA) === ++ if customizer_config.training.training_type in ( ++ TrainingType.SFT, ++ TrainingType.DISTILLATION, ++ ) and customizer_config.training.finetuning_type in (FinetuningType.LORA, FinetuningType.LORA_MERGED): ++ lora = customizer_config.training.lora ++ if lora is None: ++ lora = LoRAConfig() ++ peft_cfg: dict[str, Any] = { ++ "_target_": "nemo_automodel.components._peft.lora.PeftConfig", ++ "dim": lora.rank, ++ "alpha": lora.alpha, ++ "dropout": lora.dropout, ++ "use_triton": lora.use_triton, ++ "target_modules": lora.target_modules, ++ } ++ # TODO: Support exclude_modules via the API ++ # if lora.exclude_modules: ++ # peft_cfg["exclude_modules"] = lora.exclude_modules ++ cfg["peft"] = peft_cfg ++ ++ # === Loss === ++ if not _is_embedding_model: ++ cfg["loss_fn"] = { ++ "_target_": "nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy", ++ } ++ ++ # === Custom Model Configuration === ++ # Check for custom Automodel implementations (e.g., MoE models) ++ # and configure backend/parallelizer settings ++ if not _is_embedding_model: ++ _configure_moe_backend(cfg, customizer_config, trust_remote_code=trust_remote_code) ++ ++ # === Knowledge Distillation === ++ if customizer_config.training.training_type == TrainingType.DISTILLATION: ++ _configure_kd(cfg, customizer_config, trust_remote_code=trust_remote_code) ++ ++ # === Integrations (Runtime Environment) === ++ ++ # WandB - check for API key in environment ++ wandb_config = build_wandb_config( ++ customizer_config=customizer_config, ++ job_ctx=job_ctx, ++ framework="automodel", ++ ) ++ if wandb_config: ++ cfg["wandb"] = wandb_config ++ logger.info(f"WandB enabled: project={wandb_config.get('project')}") ++ ++ # MLflow ++ mlflow_config = build_mlflow_config( ++ customizer_config=customizer_config, ++ job_ctx=job_ctx, ++ framework="automodel", ++ ) ++ if mlflow_config: ++ cfg["mlflow"] = mlflow_config ++ 
def _configure_moe_backend(
    cfg: dict[str, Any], customizer_config: TrainingStepConfig, trust_remote_code: bool = False
) -> None:
    """
    Add the Automodel MoE backend section to ``cfg`` when the model requires it.

    The function works in two phases:

    1. Detection (best effort): load the HuggingFace config for
       ``customizer_config.model.path``, check whether its first architecture is
       registered in ``ModelRegistry.model_arch_name_to_cls``, and read the expert
       count from ``num_local_experts``, ``num_experts`` or ``n_routed_experts``.
       Any failure in this phase is logged as a warning and the function returns
       without touching ``cfg``.
    2. Validation and configuration (strict): only for models that have a custom
       implementation and more than one expert. Parallelism settings are checked
       and ``cfg["model"]["backend"]`` is set with DeepEP disabled.

    Dense models with a custom implementation, and models without one, leave
    ``cfg`` unchanged.

    Args:
        cfg: Configuration dictionary to update in place.
        customizer_config: Training step configuration; ``model.path`` and
            ``parallelism`` are read.
        trust_remote_code: Forwarded to ``AutoConfig.from_pretrained``.

    Raises:
        ValueError: For a multi-GPU MoE job that sets ``tensor_parallel_size > 1``
            or does not set ``expert_parallel_size > 1``.
    """
    # Import here to avoid ModuleNotFoundError in environments where
    # transformers is not installed (e.g., during test collection)
    from transformers import AutoConfig

    model_path = customizer_config.model.path

    # --- Phase 1: detection -------------------------------------------------
    # Kept separate from validation so that a ValueError raised while loading
    # the config (as opposed to one raised by our own checks below) is treated
    # like any other detection failure instead of aborting the job.
    try:
        hf_config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code)
        architectures = getattr(hf_config, "architectures", None)

        # Check if model has a custom Automodel implementation
        has_custom_impl = bool(architectures) and architectures[0] in ModelRegistry.model_arch_name_to_cls

        num_experts = None
        is_moe_model = False
        if has_custom_impl:
            # MoE models use num_local_experts (Mixtral-style), num_experts (older),
            # or n_routed_experts (NemotronH)
            num_experts = (
                getattr(hf_config, "num_local_experts", None)
                or getattr(hf_config, "num_experts", None)
                or getattr(hf_config, "n_routed_experts", None)
            )
            is_moe_model = num_experts is not None and num_experts > 1
    except Exception as e:
        # Don't fail training if we can't check for custom implementations
        logger.warning(
            f"Failed to check for custom model implementation: {e}. Using standard HuggingFace model implementation."
        )
        return

    if not has_custom_impl:
        logger.debug(
            f"No custom Automodel implementation found for {model_path}. "
            "Using standard HuggingFace model implementation."
        )
        return

    if not is_moe_model:
        logger.info(
            f"Detected custom Automodel implementation for architecture: {architectures[0]}. "
            "Not an MoE model, skipping MoE-specific configurations."
        )
        return

    logger.info(
        f"Detected MoE model with custom Automodel implementation for architecture: {architectures[0]}. "
        f"Adding MoE-specific configurations (num_experts={num_experts})."
    )

    # --- Phase 2: validation ------------------------------------------------
    # Automodel's MoE parallelizer does not support tensor parallelism:
    #   assert tp_axis_name is None or world_mesh[tp_axis_name].size() == 1
    # See: nemo_automodel/components/moe/parallelizer.py
    p = customizer_config.parallelism
    total_gpus = p.num_nodes * p.num_gpus_per_node
    if total_gpus > 1:
        if p.tensor_parallel_size > 1:
            raise ValueError(
                f"Tensor parallelism (tensor_parallel_size={p.tensor_parallel_size}) is not supported for MoE models."
            )
        ep = p.expert_parallel_size
        if ep is None or ep <= 1:
            raise ValueError(
                f"MoE model detected (num_experts={num_experts}) but expert_parallel_size "
                f"is {ep or 'not set'}. Multi-GPU MoE training requires expert_parallel_size > 1."
            )

    # Backend configuration for MoE models
    # DeepEP is disabled for stability - it's a newer feature that can cause issues
    cfg.setdefault("model", {})["backend"] = {
        "_target_": "nemo_automodel.components.models.common.utils.BackendConfig",
        "enable_deepep": False,
    }
def _configure_chat_dataset(
    cfg: dict[str, Any],
    customizer_config: TrainingStepConfig,
    train_file: Path,
    val_file: Path,
    seq_length: int,
) -> None:
    """Fill in the ChatDataset sections for OpenAI-messages (chat) data.

    Writes ``cfg["dataset"]`` and ``cfg["validation_dataset"]`` and records the
    resolved chat template (which may be falsy) under
    ``cfg["_resolved_chat_template"]``.

    Args:
        cfg: Configuration dictionary to populate in place.
        customizer_config: Training step configuration; the model path, name,
            chat template and pipeline-parallel size are read.
        train_file: Training data file.
        val_file: Validation data file.
        seq_length: Sequence length passed to both dataset sections.
    """
    logger.info(f"Configuring ChatDataset for chat format data with seq_length={seq_length}")

    # Template resolution priority (per resolve_chat_template's contract):
    #   1. Fileset metadata chat_template (from model entity spec)
    #   2. Custom template from DEFAULT_CHAT_TEMPLATES (if model.name matches)
    #   3. The model's built-in tokenizer template
    template = resolve_chat_template(
        model_path=customizer_config.model.path,
        model_name=customizer_config.model.name,
        user_template=customizer_config.model.chat_template,
    )

    uses_pipeline_parallel = customizer_config.parallelism.pipeline_parallel_size > 1
    padding_mode = "max_length" if uses_pipeline_parallel else "do_not_pad"

    # "split" must be present: Automodel's pack_dataset() reads cfg_ds.split when
    # sequence packing is enabled, and build_dataloader() raises AttributeError
    # without it.
    sections = (
        ("dataset", train_file, "train"),
        ("validation_dataset", val_file, "validation"),
    )
    for section_name, data_file, split_name in sections:
        section_cfg: dict[str, Any] = {
            "_target_": "nemo_automodel.components.datasets.llm.chat_dataset.ChatDataset",
            "path_or_dataset_id": str(data_file),
            "split": split_name,
            "seq_length": seq_length,
            "padding": padding_mode,
        }
        if template:
            section_cfg["chat_template"] = template
        cfg[section_name] = section_cfg

    if template:
        logger.info("Added chat template to dataset config")
    else:
        logger.warning("No chat template found - ChatDataset may fail")

    # Keep the resolved template in the config so checkpoint processing applies
    # the same template that was used for training.
    cfg["_resolved_chat_template"] = template
customizer_config.parallelism.pipeline_parallel_size > 1 ++ # Note: "split" is required by Automodel's pack_dataset() when sequence packing is enabled. ++ # Without it, build_dataloader() raises AttributeError accessing cfg_ds.split. ++ cfg["dataset"] = { ++ "_target_": "nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset", ++ "path_or_dataset_id": str(train_file), ++ "split": "train", ++ "column_mapping": { ++ "question": question_col, ++ "answer": answer_col, ++ }, ++ "seq_length": seq_length, ++ "answer_only_loss_mask": True, ++ "padding": "do_not_pad" if not pp_enabled else "max_length", ++ "truncation": "longest_first", ++ } ++ cfg["validation_dataset"] = { ++ "_target_": "nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset", ++ "path_or_dataset_id": str(val_file), ++ "split": "validation", ++ "column_mapping": { ++ "question": question_col, ++ "answer": answer_col, ++ }, ++ "seq_length": seq_length, ++ "answer_only_loss_mask": True, ++ "padding": "do_not_pad" if not pp_enabled else "max_length", ++ "truncation": "longest_first", ++ } ++ ++ ++def _configure_embedding_dataset( ++ cfg: dict[str, Any], ++ customizer_config: TrainingStepConfig, ++ train_file: Path, ++ val_file: Path, ++ seed: int, ++ embedding_config: EmbeddingConfig, ++) -> None: ++ """Configure embedding/retrieval dataset for biencoder training. ++ ++ Uses Automodel's inline retrieval dataset format which directly accepts ++ Customizer's embedding format without conversion: ++ {"query": "...", "pos_doc": "...", "neg_doc": ["...", "..."]} ++ ++ This uses retrieval_dataset_inline.make_retrieval_dataset which handles: ++ - Loading inline text directly from JSONL ++ - RetrievalBiencoderCollator for tokenization and batching ++ ++ Args: ++ cfg: Configuration dictionary to populate. ++ customizer_config: Training step configuration. ++ train_file: Path to training JSONL file. 
++ val_file: Path to validation JSONL file. ++ seed: Random seed for reproducibility. ++ embedding_config: Embedding model configuration. ++ """ ++ ++ logger.info(f"Configuring embedding dataset with train_n_passages={embedding_config.train_n_passages}") ++ ++ cfg["dataloader"] = { ++ "_target_": "torchdata.stateful_dataloader.StatefulDataLoader", ++ "dataset": { ++ "_target_": "nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset", ++ "data_dir_list": [str(train_file)], ++ "data_type": "train", ++ "train_n_passages": embedding_config.train_n_passages, ++ "seed": seed, ++ "do_shuffle": True, ++ }, ++ "collate_fn": { ++ "_target_": "nemo_automodel.components.datasets.llm.RetrievalBiencoderCollator", ++ "q_max_len": embedding_config.query_max_length, ++ "p_max_len": embedding_config.passage_max_length, ++ "query_prefix": embedding_config.query_prefix, ++ "passage_prefix": embedding_config.passage_prefix, ++ "pad_to_multiple_of": 8, ++ }, ++ "shuffle": True, ++ "num_workers": 0, ++ } ++ ++ if val_file and val_file.exists(): ++ cfg["validation_dataloader"] = { ++ "_target_": "torchdata.stateful_dataloader.StatefulDataLoader", ++ "dataset": { ++ "_target_": "nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset", ++ "data_dir_list": [str(val_file)], ++ "data_type": "eval", ++ "train_n_passages": embedding_config.train_n_passages, ++ "eval_negative_size": get_eval_negative_size(embedding_config), ++ "seed": seed, ++ "do_shuffle": False, ++ }, ++ "collate_fn": { ++ "_target_": "nemo_automodel.components.datasets.llm.RetrievalBiencoderCollator", ++ "q_max_len": embedding_config.query_max_length, ++ "p_max_len": embedding_config.passage_max_length, ++ "query_prefix": embedding_config.query_prefix, ++ "passage_prefix": embedding_config.passage_prefix, ++ "padding": "longest", ++ "pad_to_multiple_of": 8, ++ }, ++ "batch_size": customizer_config.batch.micro_batch_size, ++ "shuffle": False, ++ "num_workers": 0, 
def _verify_tokenizer_compatibility(student_path: str, teacher_path: str, trust_remote_code: bool = False) -> None:
    """
    Verify that student and teacher models have compatible tokenizers.

    Knowledge distillation requires the student and teacher to have the same
    vocabulary so their logit spaces are aligned. This check compares the two
    tokenizers' ``vocab_size`` values only; it does not compare individual tokens.

    Loading is best effort: if either tokenizer cannot be loaded or inspected,
    a warning is logged and the function returns without raising, leaving the
    failure to surface later during training.

    Args:
        student_path: Path or identifier of the student model.
        teacher_path: Path or identifier of the teacher model.
        trust_remote_code: Forwarded to ``AutoTokenizer.from_pretrained`` for
            both tokenizers.

    Raises:
        ValueError: If both tokenizers load and their vocabulary sizes differ.
    """
    # Import here to avoid ModuleNotFoundError in environments where
    # transformers is not installed (e.g., during test collection)
    from transformers import AutoTokenizer

    # Only the loading step is wrapped. The mismatch error is raised outside the
    # try block, so it never has to be told apart from loader failures by
    # inspecting exception message text.
    try:
        student_tokenizer = AutoTokenizer.from_pretrained(student_path, trust_remote_code=trust_remote_code)
        teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_path, trust_remote_code=trust_remote_code)
        student_vocab_size = student_tokenizer.vocab_size
        teacher_vocab_size = teacher_tokenizer.vocab_size
    except Exception as e:
        # Log but don't fail for tokenizer loading issues (e.g., network issues,
        # missing files) - the training will fail later with a clearer error
        logger.warning(f"Could not verify tokenizer compatibility: {e}")
        return

    if student_vocab_size != teacher_vocab_size:
        raise ValueError(
            f"Tokenizer vocabulary size mismatch: student has {student_vocab_size} tokens, "
            f"teacher has {teacher_vocab_size} tokens. "
            "Knowledge distillation requires matching vocabularies."
        )

    logger.info(f"Tokenizer compatibility verified: both models have vocab_size={student_vocab_size}")
++ ++ Automodel's KnowledgeDistillationRecipeForNextTokenPrediction requires: ++ - teacher_model: Frozen teacher model for soft targets ++ - kd_ratio: Balance between CE and KD loss (0=CE only, 1=KD only) ++ - kd_loss_fn: KL-divergence loss with temperature scaling ++ - offload_teacher_model: Optional CPU offloading for memory efficiency ++ """ ++ kd_config = customizer_config.training.kd ++ if not kd_config or not kd_config.teacher_model: ++ raise ValueError( ++ "Knowledge distillation requires training.kd.teacher to be set. " ++ "Ensure the job input includes a teacher model." ++ ) ++ ++ # Verify tokenizer compatibility before proceeding ++ _verify_tokenizer_compatibility( ++ customizer_config.model.path, ++ kd_config.teacher_model.path, ++ trust_remote_code=trust_remote_code, ++ ) ++ ++ # Teacher model (frozen, same architecture loading as student) ++ # Use teacher's precision if specified, otherwise fall back to student's precision ++ teacher_precision = kd_config.teacher_model.precision or customizer_config.model.precision ++ cfg["teacher_model"] = { ++ "_target_": "nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained", ++ "pretrained_model_name_or_path": kd_config.teacher_model.path, ++ "torch_dtype": teacher_precision.to_torch_dtype() if teacher_precision else "auto", ++ "attn_implementation": kd_config.teacher_model.attn_implementation, ++ "trust_remote_code": kd_config.teacher_model.trust_remote_code, ++ } ++ ++ # KD loss function with temperature ++ cfg["kd_loss_fn"] = { ++ "_target_": "nemo_automodel.components.loss.kd_loss.KDLoss", ++ "ignore_index": -100, ++ "temperature": kd_config.temperature, ++ "fp32_upcast": True, # Recommended for numerical stability ++ } ++ ++ # KD ratio (blend between CE and KD loss) ++ cfg["kd_ratio"] = kd_config.ratio ++ ++ # Optional: Offload teacher to CPU for memory efficiency ++ if kd_config.offload_teacher: ++ cfg["offload_teacher_model"] = True ++ logger.info("Teacher model will be offloaded to CPU between forward 
def get_eval_negative_size(embedding_config: EmbeddingConfig) -> int:
    """Resolve how many negative passages per query to use during evaluation.

    An explicitly configured ``eval_negative_size`` (including 0) is returned
    as-is. When it is ``None``, the value is derived from the training setting:
    ``train_n_passages`` counts one positive plus N negatives, so the number of
    negatives is ``train_n_passages - 1``.

    Example: train_n_passages=5 (1 pos + 4 neg) -> eval_negative_size=4
    """
    configured = embedding_config.eval_negative_size
    if configured is None:
        # Drop the single positive passage from the training total.
        return embedding_config.train_n_passages - 1
    return configured
@runtime_checkable
class AutomodelRecipe(Protocol):
    """Structural type for the Automodel recipes this module wraps.

    Listing the attributes and methods here makes the wrapper's expectations
    explicit and checkable, rather than relying on implicit attributes.
    """

    # Recipe configuration object.
    cfg: Any
    # Scheduler the wrapper reads step/epoch information from.
    step_scheduler: StepScheduler
    # Checkpointer the wrapper reads the checkpoint directory from.
    checkpointer: Checkpointer
    # Distributed environment handle.
    dist_env: Any

    def setup(self) -> None:
        """Construct every component required before training can start."""

    def run_train_validation_loop(self) -> None:
        """Execute the main train/validate loop."""

    def log_train_metrics(self, log_data: Any) -> None:
        """Record metrics for a training step."""

    def log_val_metrics(self, *args: Any, **kwargs: Any) -> None:
        """Record metrics for a validation pass.

        The positional signature is intentionally open because it differs
        between Automodel recipes:
        - LLM/KD: (val_name, log_data, metric_logger=None)
        - VLM/biencoder/seq_cls: (log_data)
        """

    def save_checkpoint(
        self,
        epoch: int,
        step: int,
        train_loss: float,
        val_loss: dict[str, float] | None = None,
        best_metric_key: str = "default",
    ) -> None:
        """Persist a checkpoint for the given epoch and step."""
close the progress callback.""" ++ try: ++ self.callback.report_training_start(self.max_steps, self.num_epochs) ++ self._recipe.run_train_validation_loop() ++ finally: ++ if self.callback: ++ self.callback.close() ++ logger.info("Training progress callback closed") ++ ++ def _log_train_metrics(self, log_data: Any) -> None: ++ """Wrapped log_train_metrics with Jobs-service reporting.""" ++ self._original_log_train_metrics(log_data) ++ if self.callback and log_data: ++ try: ++ metrics = getattr(log_data, "metrics", {}) ++ self.callback.report_train_step( ++ step=getattr(log_data, "step", 0) + 1, # Convert to 1-based ++ epoch=getattr(log_data, "epoch", 0) + 1, # Convert to 1-based ++ loss=metrics.get("loss", 0.0), ++ lr=metrics.get("lr"), ++ grad_norm=metrics.get("grad_norm"), ++ ) ++ except Exception as e: ++ logger.warning(f"Failed to report training progress: {e}") ++ ++ try: ++ if self._recipe.step_scheduler.is_last_batch: ++ self.callback.report_epoch_end( ++ step=self._recipe.step_scheduler.step + 1, ++ epoch=self._recipe.step_scheduler.epoch + 1, ++ ) ++ except Exception as e: ++ logger.warning(f"Failed to report epoch end: {e}") ++ ++ def _log_val_metrics(self, *args: Any, **kwargs: Any) -> None: ++ """Wrapped log_val_metrics with Jobs-service reporting. ++ ++ Handles different Automodel recipe signatures: ++ - LLM/KD: (val_name, log_data, metric_logger=None) ++ - VLM/biencoder/seq_cls: (log_data) ++ """ ++ # Call original method first with whatever args were passed ++ self._original_log_val_metrics(*args, **kwargs) ++ ++ # Extract log_data from args (it's always the last positional arg before kwargs) ++ # LLM signature: (val_name, log_data, metric_logger=None) -> log_data is args[1] ++ # VLM/biencoder signature: (log_data) -> log_data is args[0] ++ log_data = None ++ if len(args) >= 2: ++ # LLM/KD style: (val_name, log_data, ...) 
++ log_data = args[1] ++ elif len(args) == 1: ++ # VLM/biencoder style: (log_data) ++ log_data = args[0] ++ ++ if self.callback and log_data: ++ try: ++ metrics = getattr(log_data, "metrics", {}) ++ self.callback.report_validation( ++ step=getattr(log_data, "step", 0) + 1, # Convert to 1-based ++ epoch=getattr(log_data, "epoch", 0) + 1, # Convert to 1-based ++ val_loss=metrics.get("val_loss", 0.0), ++ ) ++ except Exception as e: ++ logger.warning(f"Failed to report validation progress: {e}") ++ ++ def _save_checkpoint( ++ self, ++ epoch: int, ++ step: int, ++ train_loss: float, ++ val_loss: dict[str, float] | None = None, ++ best_metric_key: str = "default", ++ ) -> None: ++ """Wrapped save_checkpoint with Jobs-service reporting.""" ++ self._original_save_checkpoint(epoch, step, train_loss, val_loss, best_metric_key) ++ if self.callback: ++ try: ++ checkpoint_dir = getattr( ++ getattr(self._recipe.checkpointer, "config", None), ++ "checkpoint_dir", ++ None, ++ ) ++ self.callback.report_checkpoint_saved( ++ step=step + 1, # Convert to 1-based ++ epoch=epoch + 1, # Convert to 1-based ++ checkpoint_path=str(checkpoint_dir) if checkpoint_dir else None, ++ ) ++ except Exception as e: ++ logger.warning(f"Failed to report checkpoint save: {e}") ++ ++ ++def _is_kd_config(cfg: Any) -> bool: ++ """Check if config is for knowledge distillation.""" ++ return cfg.get("teacher_model") is not None or cfg.get("kd_ratio") is not None ++ ++ ++def _is_biencoder_config(cfg: Any) -> bool: ++ """Check if config is for biencoder/embedding model training. ++ ++ Detects biencoder configs by checking if model._target_ contains 'biencoder'. ++ ++ Note: ConfigNode automatically resolves _target_ to the actual function/class, ++ so we check the function's __module__ or __qualname__ for 'biencoder'. 
def create_automodel_recipe(cfg: Any) -> AutomodelRecipeWrapper:
    """Build the recipe selected by *cfg* and wrap it for progress reporting.

    Selection order: biencoder config first, then knowledge distillation,
    otherwise plain SFT fine-tuning.
    """
    if _is_biencoder_config(cfg):
        announcement = "Detected biencoder config, using embedding model recipe"
        recipe_cls = TrainBiencoderRecipe
    elif _is_kd_config(cfg):
        announcement = "Detected Knowledge Distillation config, using KD recipe"
        recipe_cls = KnowledgeDistillationRecipeForNextTokenPrediction
    else:
        announcement = "Using SFT fine-tuning recipe"
        recipe_cls = TrainFinetuneRecipeForNextTokenPrediction

    # Log the choice before the recipe is constructed.
    logger.info(announcement)
    return AutomodelRecipeWrapper(recipe_cls(cfg))
b/services/automodel/src/nmp/automodel/tasks/training/chat_templates.py +@@ -0,0 +1,196 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++""" ++Chat template resolution and application for training backends. ++ ++This module provides: ++1. Priority-based chat template selection (resolve_chat_template) ++2. Applying chat templates to output checkpoints (apply_chat_template_to_checkpoint) ++ ++Chat template priority order: ++1. User-provided template (via API) ++2. Custom template from DEFAULT_CHAT_TEMPLATES map (enhanced for tool calling) ++3. Model's built-in tokenizer template (fallback) ++ ++The custom templates in the templates/ directory extend base model templates with: ++- Tool calling support: , , formatting ++- Generation markers: {% generation %}...{% endgeneration %} blocks for loss masking ++- Enhanced compatibility across models ++""" ++ ++import json ++import logging ++from pathlib import Path ++ ++logger = logging.getLogger(__name__) ++ ++# Directory containing custom chat template jinja files ++TEMPLATES_DIR = Path(__file__).parent / "templates" ++ ++# ============================================================================ ++# Model Name Constants ++# ============================================================================ ++ ++# Meta Llama models ++META_LLAMA_31_8B_INSTRUCT = "meta/llama-3.1-8b-instruct" ++META_LLAMA_31_70B_INSTRUCT = "meta/llama-3.1-70b-instruct" ++META_LLAMA_31_405B_INSTRUCT = "meta/llama-3.1-405b-instruct" ++META_LLAMA_32_1B = "meta/llama-3.2-1b" ++META_LLAMA_32_1B_INSTRUCT = "meta/llama-3.2-1b-instruct" ++META_LLAMA_32_3B_INSTRUCT = "meta/llama-3.2-3b-instruct" ++META_LLAMA_33_70B_INSTRUCT = "meta/llama-3.3-70b-instruct" ++# NVIDIA Nemotron models ++NVIDIA_NEMOTRON_31_8B = "nvidia/nemotron-nano-llama-3.1-8b" ++NVIDIA_NEMOTRON_31_70B = "nvidia/nemotron-llama-3.1-70b" ++NVIDIA_NEMOTRON_33_49B = 
"nvidia/nemotron-super-llama-3.3-49b" ++NVIDIA_NEMOTRON_33_49B_V1_5 = "nvidia/nemotron-super-llama-3.3-49b-v1.5" ++# NIM model names (alternative naming) ++NIM_NVIDIA_NEMOTRON_31_8B = "nvidia/llama-3.1-nemotron-nano-8b-v1" ++NIM_NVIDIA_NEMOTRON_31_70B = "nvidia/llama-3.1-nemotron-70b-instruct" ++NIM_NVIDIA_NEMOTRON_33_49B = "nvidia/llama-3.3-nemotron-super-49b-v1" ++NIM_NVIDIA_NEMOTRON_33_49B_V1_5 = "nvidia/llama-3.3-nemotron-super-49b-v1.5" ++# Microsoft models ++PHI_4 = "microsoft/phi-4" ++ ++# ============================================================================ ++# Default Chat Templates Map ++# ============================================================================ ++ ++# Maps model names to custom jinja template filenames. ++# These templates extend the base model templates with: ++# - Tool calling support ++# - Generation markers for loss masking ++# - Enhanced compatibility ++DEFAULT_CHAT_TEMPLATES: dict[str, str] = { ++ # Llama 3.1 family ++ META_LLAMA_31_8B_INSTRUCT: "llama-3.1-instruct.jinja", ++ META_LLAMA_31_70B_INSTRUCT: "llama-3.1-instruct.jinja", ++ META_LLAMA_31_405B_INSTRUCT: "llama-3.1-instruct.jinja", ++ # Llama 3.2 family ++ META_LLAMA_32_1B: "llama-3.2-instruct.jinja", ++ META_LLAMA_32_1B_INSTRUCT: "llama-3.2-instruct.jinja", ++ META_LLAMA_32_3B_INSTRUCT: "llama-3.2-instruct.jinja", ++ # Llama 3.3 family ++ META_LLAMA_33_70B_INSTRUCT: "llama-3.3-instruct.jinja", ++ # Nemotron family ++ NVIDIA_NEMOTRON_31_8B: "nemotron-3.1.jinja", ++ NVIDIA_NEMOTRON_31_70B: "nemotron-3.1.jinja", ++ NVIDIA_NEMOTRON_33_49B: "nemotron-super-3.3.jinja", ++ NVIDIA_NEMOTRON_33_49B_V1_5: "nemotron-super-3.3.jinja", ++ # NIM Nemotron (alternative naming) ++ NIM_NVIDIA_NEMOTRON_31_8B: "nemotron-3.1.jinja", ++ NIM_NVIDIA_NEMOTRON_31_70B: "nemotron-3.1.jinja", ++ NIM_NVIDIA_NEMOTRON_33_49B: "nemotron-super-3.3.jinja", ++ NIM_NVIDIA_NEMOTRON_33_49B_V1_5: "nemotron-super-3.3.jinja", ++ # Microsoft ++ PHI_4: "phi-4.jinja", ++} ++ ++ ++def 
_load_template_file(template_filename: str) -> str | None: ++ """Load a custom template from the templates directory.""" ++ template_path = TEMPLATES_DIR / template_filename ++ if template_path.exists(): ++ with open(template_path, "r", encoding="utf-8") as f: ++ return f.read() ++ logger.warning(f"Template file not found: {template_path}") ++ return None ++ ++ ++def _get_tokenizer_chat_template(model_path: str) -> str | None: ++ """ ++ Get chat template from model's tokenizer. ++ ++ Uses AutoTokenizer which handles all model formats (HF, NeMo, custom). ++ """ ++ try: ++ from transformers import AutoTokenizer ++ ++ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) ++ template = getattr(tokenizer, "chat_template", None) ++ if template: ++ logger.debug(f"Found chat template in tokenizer for {model_path}") ++ return template ++ except Exception as e: ++ logger.warning(f"Could not load tokenizer to get chat template: {e}") ++ return None ++ ++ ++def resolve_chat_template( ++ model_path: str, ++ model_name: str | None = None, ++ user_template: str | None = None, ++) -> str | None: ++ """ ++ Resolve chat template using priority-based selection. ++ ++ Priority order: ++ 1. User-provided template (highest priority) ++ 2. Custom template from DEFAULT_CHAT_TEMPLATES (if model_name matches) ++ 3. Model's built-in tokenizer template (fallback) ++ ++ Args: ++ model_path: Path to the model directory (for tokenizer fallback). ++ model_name: Canonical model name (e.g., "meta/llama-3.1-8b-instruct"). ++ Used to look up custom templates. ++ user_template: User-provided template string (takes highest priority). ++ ++ Returns: ++ The resolved chat template string, or None if no template found. 
++ """ ++ # Priority 1: User-provided template ++ if user_template: ++ logger.info("Using user-provided chat template") ++ return user_template ++ ++ # Priority 2: Custom template from DEFAULT_CHAT_TEMPLATES ++ if model_name and model_name in DEFAULT_CHAT_TEMPLATES: ++ template_filename = DEFAULT_CHAT_TEMPLATES[model_name] ++ template = _load_template_file(template_filename) ++ if template: ++ logger.info(f"Using custom chat template for {model_name}: {template_filename}") ++ return template ++ ++ # Priority 3: Model's built-in tokenizer template ++ template = _get_tokenizer_chat_template(model_path) ++ if template: ++ logger.info(f"Using model's built-in chat template from {model_path}") ++ return template ++ ++ logger.warning(f"No chat template found for model_name={model_name}, model_path={model_path}") ++ return None ++ ++ ++def apply_chat_template_to_checkpoint( ++ output_path: Path, ++ chat_template: str | None, ++) -> None: ++ """ ++ Apply chat template to the output checkpoint's tokenizer_config.json. ++ ++ Also ensures pad_token is set if missing (uses eos_token as fallback), ++ which is required by many inference frameworks. ++ ++ Args: ++ output_path: Path to the checkpoint directory containing tokenizer_config.json. ++ chat_template: The chat template string to apply. If None, skips application. 
++ """ ++ if not chat_template: ++ logger.warning("No chat template provided, skipping") ++ return ++ ++ tokenizer_config = output_path / "tokenizer_config.json" ++ if not tokenizer_config.exists(): ++ logger.warning(f"tokenizer_config.json not found at {output_path}") ++ return ++ ++ with open(tokenizer_config, "r") as f: ++ config = json.load(f) ++ ++ config["chat_template"] = chat_template ++ ++ with open(tokenizer_config, "w") as f: ++ json.dump(config, f, indent=2) ++ ++ logger.info("Applied chat template to output checkpoint") +diff --git a/services/automodel/src/nmp/automodel/tasks/training/datasets/preparation.py b/services/automodel/src/nmp/automodel/tasks/training/datasets/preparation.py +new file mode 100644 +index 000000000..63473c5d9 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/training/datasets/preparation.py +@@ -0,0 +1,509 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++# ++# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual ++# property and proprietary rights in and to this material, related ++# documentation and any modifications thereto. Any use, reproduction, ++# disclosure or distribution of this material and related documentation ++# without an express license agreement from NVIDIA CORPORATION or ++# its affiliates is strictly prohibited. 
++ ++"""Dataset discovery, merge/split, and schedule helpers for Automodel training.""" ++ ++import json ++import logging ++import random ++import re ++import shutil ++import subprocess ++from dataclasses import dataclass ++from enum import Enum ++from pathlib import Path ++from typing import Any, Optional, Union ++ ++from nmp.automodel.app.constants import DEFAULT_SEED ++ ++logger = logging.getLogger(__name__) ++ ++# Dataset directory constants for merged files (we control this structure) ++MERGED_DIR = "merged" ++TRAIN_FILE = "train.jsonl" ++VAL_FILE = "validation.jsonl" ++ ++# Heuristic patterns for discovering training files ++TRAIN_PATTERNS = [ ++ "train*.jsonl", ++ "training*.jsonl", ++ "train*.json", ++ "training*.json", ++] ++TRAIN_DIRS = ["train", "training"] ++ ++# Heuristic patterns for discovering validation files ++VAL_PATTERNS = [ ++ "val*.jsonl", ++ "validation*.jsonl", ++ "val*.json", ++ "validation*.json", ++ "dev*.jsonl", ++ "dev*.json", ++] ++VAL_DIRS = ["val", "validation", "dev"] ++ ++ ++class DatasetSchema(str, Enum): ++ """Detected dataset schema type.""" ++ ++ CHAT = "chat" # OpenAI messages format: {"messages": [...]} ++ SFT = "sft" # Prompt/completion: {"prompt": ..., "completion": ...} ++ CUSTOM = "custom" # Custom columns via prompt_template ++ EMBEDDING = "embedding" # Retrieval format: {"query": ..., "pos_doc": ..., "neg_doc": [...]} ++ ++ ++class DatasetFormatError(Exception): ++ """Raised when dataset format is invalid or unsupported.""" ++ ++ pass ++ ++ ++def detect_dataset_schema( ++ file_path: Path, ++ prompt_template: str | None = None, ++) -> tuple[DatasetSchema, tuple[str, ...] | None]: ++ """ ++ Detect dataset schema by sampling the first line. ++ ++ Supports four formats: ++ 1. Chat format: {"messages": [{"role": "user", ...}, {"role": "assistant", ...}]} ++ 2. Embedding format: {"query": "...", "pos_doc": "...", "neg_doc": ["...", "..."]} ++ 3. SFT format: {"prompt": "...", "completion": "..."} ++ 4. 
Custom format: Any two-column format specified via prompt_template like "{input} {output}" ++ ++ Args: ++ file_path: Path to the JSONL dataset file. ++ prompt_template: Optional template string with two placeholders like "{input} {output}". ++ ++ Returns: ++ Tuple of (schema_type, column_keys) where: ++ - CHAT: column_keys is None ++ - EMBEDDING: column_keys is ("query", "pos_doc", "neg_doc") ++ - SFT/CUSTOM: column_keys is (question_col, answer_col) ++ ++ Raises: ++ DatasetFormatError: If the dataset format cannot be detected or is invalid. ++ """ ++ with open(file_path, "r", encoding="utf-8") as f: ++ line = f.readline() ++ ++ try: ++ obj: dict[str, Any] = json.loads(line) ++ except json.JSONDecodeError as e: ++ raise DatasetFormatError(f"Invalid JSON in {file_path}: {e}") ++ ++ # Check for chat format (OpenAI messages) ++ if "messages" in obj and isinstance(obj["messages"], list): ++ if len(obj["messages"]) > 0 and isinstance(obj["messages"][0], dict): ++ if "role" in obj["messages"][0]: ++ logger.info(f"Detected chat dataset format in {file_path}") ++ return DatasetSchema.CHAT, None ++ ++ # Check for embedding/retrieval format ++ # Format: {"query": "...", "pos_doc": "...", "neg_doc": ["...", "..."]} ++ if "query" in obj and "pos_doc" in obj and "neg_doc" in obj: ++ if isinstance(obj["query"], str) and isinstance(obj["pos_doc"], str) and isinstance(obj["neg_doc"], list): ++ logger.info(f"Detected embedding/retrieval dataset format in {file_path}") ++ return DatasetSchema.EMBEDDING, ("query", "pos_doc", "neg_doc") ++ ++ # Check for custom prompt_template format ++ if prompt_template: ++ keys = re.findall(r"\{(.*?)\}", prompt_template) ++ if len(keys) == 2: ++ # Validate keys exist in data ++ if all(k in obj for k in keys): ++ logger.info(f"Detected custom template format with keys {keys}") ++ return DatasetSchema.CUSTOM, (keys[0], keys[1]) ++ else: ++ raise DatasetFormatError( ++ f"prompt_template keys {keys} not found in dataset. 
def _count_jsonl_samples_python(file_path: Path) -> int:
    """Pure Python implementation of line counting (fallback)."""
    with open(file_path, "r", encoding="utf-8") as f:
        return sum(1 for line in f if line.strip())


def count_jsonl_samples(file_path: Path) -> int:
    """
    Count the number of non-empty lines in a JSONL file.

    Uses grep for efficiency with large files when available,
    falls back to pure Python implementation otherwise.

    Args:
        file_path: Path to the JSONL file.

    Returns:
        Number of non-empty lines (samples) in the file.

    Raises:
        OSError: If the file cannot be read (e.g. it does not exist). The
            error comes from the Python fallback, so a grep failure is never
            silently reported as "0 samples".
    """
    # Check if grep is available
    if shutil.which("grep") is None:
        return _count_jsonl_samples_python(file_path)

    try:
        # Use `grep -c "\S"` to count non-empty lines (excludes trailing empty lines).
        # "--" stops option parsing so a path starting with "-" is still a path.
        result = subprocess.check_output(
            ["grep", "-c", r"\S", "--", str(file_path)],
            text=True,
            stderr=subprocess.DEVNULL,
        )
        return int(result.strip())
    except subprocess.CalledProcessError as e:
        if e.returncode == 1:
            # Exit status 1 means "no line matched": the file has no samples.
            return 0
        # Status >= 2 is a real grep error (missing/unreadable file, ...).
        # Re-count in Python so the underlying problem surfaces as an exception.
        return _count_jsonl_samples_python(file_path)
    except (OSError, ValueError):
        # grep could not be spawned or printed something that is not a number.
        return _count_jsonl_samples_python(file_path)
++ """ ++ effective_steps = min(steps_per_epoch, max_steps) ++ ++ if val_check_interval is None or val_check_interval == 0: ++ # Default: validate once per epoch (or at end if max_steps < steps_per_epoch) ++ return effective_steps ++ ++ if val_check_interval < 0: ++ raise ValueError("val_check_interval cannot be negative") ++ ++ # Float <= 1.0: interpret as fraction of epoch ++ if isinstance(val_check_interval, float) and val_check_interval <= 1.0: ++ interval = max(1, int(val_check_interval * steps_per_epoch)) ++ else: ++ # Integer or float > 1.0: treat as absolute step count ++ interval = int(val_check_interval) ++ ++ # Cap at effective_steps ++ interval = min(interval, effective_steps) ++ ++ # Ensure validation happens at least once before training ends ++ if interval >= max_steps: ++ interval = max(1, max_steps - 1) ++ ++ return interval ++ ++ ++@dataclass ++class PreparedDataset: ++ """Result of dataset preparation.""" ++ ++ merged_dir: Path ++ train_file: Path ++ validation_file: Path ++ train_samples: int ++ validation_samples: int ++ ++ ++def _discover_files_by_patterns(base_path: Path, patterns: list[str], dirs: list[str]) -> list[Path]: ++ """ ++ Discover files matching patterns or in specific directories. ++ ++ Searches for: ++ 1. Files matching glob patterns in base_path ++ 2. All .jsonl/.json files in specified subdirectories ++ ++ Args: ++ base_path: Root directory to search. ++ patterns: Glob patterns to match (e.g., ["train*.jsonl"]). ++ dirs: Subdirectory names to search (e.g., ["train", "training"]). ++ ++ Returns: ++ Sorted list of discovered file paths. 
++ """ ++ files: set[Path] = set() ++ ++ # Pattern matching in base directory ++ for pattern in patterns: ++ for match in base_path.glob(pattern): ++ if match.is_file(): ++ files.add(match.resolve()) ++ ++ # Files in subdirectories ++ for dir_name in dirs: ++ subdir = base_path / dir_name ++ if subdir.is_dir(): ++ for f in subdir.iterdir(): ++ if f.is_file() and f.suffix.lower() in (".jsonl", ".json"): ++ files.add(f.resolve()) ++ ++ return sorted(files) # Sorted for deterministic ordering ++ ++ ++def discover_dataset_files(dataset_path: Path) -> tuple[list[Path], list[Path]]: ++ """ ++ Discover training and validation files using heuristics. ++ ++ Heuristics applied (in order): ++ 1. Files matching train*/training* patterns → training ++ 2. Files in train/ or training/ directories → training ++ 3. Files matching val*/validation*/dev* patterns → validation ++ 4. Files in val/, validation/, or dev/ directories → validation ++ 5. If only one .jsonl file found → treat as training (will auto-split) ++ ++ Args: ++ dataset_path: Path to the dataset directory. ++ ++ Returns: ++ Tuple of (training_files, validation_files). ++ ++ Raises: ++ DatasetFormatError: If no training files can be found. 
++ """ ++ dataset_path = Path(dataset_path).resolve() ++ ++ if not dataset_path.exists(): ++ raise DatasetFormatError(f"Dataset path does not exist: {dataset_path}") ++ ++ # If path is a file, treat it as the training file ++ if dataset_path.is_file(): ++ logger.info(f"Dataset path is a file, treating as training data: {dataset_path}") ++ return [dataset_path], [] ++ ++ # Discover training files ++ train_files = _discover_files_by_patterns(dataset_path, TRAIN_PATTERNS, TRAIN_DIRS) ++ ++ # Discover validation files ++ val_files = _discover_files_by_patterns(dataset_path, VAL_PATTERNS, VAL_DIRS) ++ ++ # Fallback: if no files found with patterns, check for any .jsonl files ++ if not train_files and not val_files: ++ all_jsonl = sorted(f for f in dataset_path.glob("*.jsonl") if f.is_file()) ++ if len(all_jsonl) == 1: ++ logger.info(f"Found single JSONL file, treating as training data: {all_jsonl[0]}") ++ train_files = all_jsonl ++ elif len(all_jsonl) > 1: ++ # Ambiguous - could be train/val or multiple training files ++ logger.warning( ++ f"Found {len(all_jsonl)} JSONL files without clear train/val naming. " ++ f"Treating all as training data: {[f.name for f in all_jsonl]}" ++ ) ++ train_files = all_jsonl ++ ++ if not train_files: ++ raise DatasetFormatError( ++ f"No training files found in {dataset_path}. " ++ f"Expected files matching patterns like train*.jsonl or a train/ directory." ++ ) ++ ++ logger.info(f"Discovered {len(train_files)} training file(s): {[f.name for f in train_files]}") ++ if val_files: ++ logger.info(f"Discovered {len(val_files)} validation file(s): {[f.name for f in val_files]}") ++ else: ++ logger.info("No validation files found - will auto-split from training data") ++ ++ return train_files, val_files ++ ++ ++def _merge_files(files: list[Path], output_file: Path) -> int: ++ """ ++ Merge multiple JSONL files into a single file. ++ ++ Args: ++ files: List of files to merge. ++ output_file: Output file path. 
def _create_val_split(
    train_file: Path,
    output_train: Path,
    output_val: Path,
    val_ratio: float = 0.1,
    seed: int = DEFAULT_SEED,
) -> tuple[int, int]:
    """
    Split a training file into train and validation sets.

    Blank lines are dropped, the remaining lines are shuffled with a seeded
    generator, and the first ``max(1, int(n * val_ratio))`` shuffled lines
    become the validation set. Every line is parsed and re-serialized as JSON
    before any output file is opened, so an invalid line aborts the split
    without truncating ``output_train`` - which matters because callers may
    pass the same path as ``train_file`` and ``output_train``.

    Args:
        train_file: Source training file.
        output_train: Output path for training split.
        output_val: Output path for validation split.
        val_ratio: Fraction of data to use for validation (default: 10%).
        seed: Random seed for reproducible splits (default: DEFAULT_SEED).

    Returns:
        Tuple of (train_samples, validation_samples).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(train_file, "r", encoding="utf-8") as f:
        lines = [line for line in f if line.strip()]

    # A private generator gives the same permutation as seeding the module-level
    # RNG, but leaves the process-wide `random` state untouched for other code.
    random.Random(seed).shuffle(lines)

    # Re-serialize to ensure valid JSON and consistent formatting. Done up
    # front so nothing is written if any line fails to parse.
    records = [json.dumps(json.loads(line)) + "\n" for line in lines]

    val_size = max(1, int(len(records) * val_ratio))
    val_records = records[:val_size]
    train_records = records[val_size:]

    output_train.parent.mkdir(parents=True, exist_ok=True)
    output_val.parent.mkdir(parents=True, exist_ok=True)

    with open(output_train, "w", encoding="utf-8") as f:
        f.writelines(train_records)

    with open(output_val, "w", encoding="utf-8") as f:
        f.writelines(val_records)

    logger.info(
        f"Created validation split: {len(train_records)} train samples, {len(val_records)} val samples "
        f"({val_ratio:.0%} split)"
    )

    return len(train_records), len(val_records)
++ """ ++ dataset_path = Path(dataset_path).resolve() ++ ++ # Determine output directory ++ if output_dir is None: ++ if dataset_path.is_file(): ++ merged_dir = dataset_path.parent / MERGED_DIR ++ else: ++ merged_dir = dataset_path / MERGED_DIR ++ else: ++ merged_dir = Path(output_dir).resolve() ++ ++ train_output = merged_dir / TRAIN_FILE ++ validation_output = merged_dir / VAL_FILE ++ ++ # Discover files ++ train_files, val_files = discover_dataset_files(dataset_path) ++ ++ # Merge training files ++ if len(train_files) == 1 and not val_files: ++ # Single file, no validation - need to split ++ logger.info("Single training file with no validation data - creating split") ++ train_samples, validation_samples = _create_val_split( ++ train_files[0], ++ train_output, ++ validation_output, ++ val_ratio=val_split_ratio, ++ seed=seed, ++ ) ++ else: ++ # Merge training files ++ train_samples = _merge_files(train_files, train_output) ++ logger.info(f"Merged {len(train_files)} training file(s) → {train_output} ({train_samples} samples)") ++ ++ if val_files: ++ # Merge validation files ++ validation_samples = _merge_files(val_files, validation_output) ++ logger.info( ++ f"Merged {len(val_files)} validation file(s) → {validation_output} ({validation_samples} samples)" ++ ) ++ else: ++ # Auto-split from merged training file ++ logger.info("No validation files - creating split from merged training data") ++ # Read merged, split, re-write ++ train_samples, validation_samples = _create_val_split( ++ train_output, ++ train_output, ++ validation_output, ++ val_ratio=val_split_ratio, ++ seed=seed, ++ ) ++ ++ return PreparedDataset( ++ merged_dir=merged_dir, ++ train_file=train_output, ++ validation_file=validation_output, ++ train_samples=train_samples, ++ validation_samples=validation_samples, ++ ) +diff --git a/services/automodel/src/nmp/automodel/tasks/training/datasets/schemas.py b/services/automodel/src/nmp/automodel/tasks/training/datasets/schemas.py +new file mode 100644 +index 
000000000..487c0151e +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/training/datasets/schemas.py +@@ -0,0 +1,430 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++# ++# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual ++# property and proprietary rights in and to this material, related ++# documentation and any modifications thereto. Any use, reproduction, ++# disclosure or distribution of this material and related documentation ++# without an express license agreement from NVIDIA CORPORATION or ++# its affiliates is strictly prohibited. ++ ++# ============================================================================= ++# Dataset Schemas for DPO Training ++# ============================================================================= ++# Preference Dataset Schemas for DPO Training: ++# - PreferenceDataset: Native format with context + ranked completions ++# - BinaryPreferenceDataset: Simple prompt/chosen/rejected strings ++# - HelpSteer3Dataset: NVIDIA HelpSteer3 format with preference scores ++# - Tulu3PreferenceDataset: AllenAI Tulu3 format with message lists ++# ++# SFT Dataset Schemas: ++# - SFTDatasetItemSchema: Standard prompt/completion format ++from typing import Annotated, Any, List, Literal, Optional, Union ++ ++from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag, model_validator ++ ++# Dataset class names from nmp.automodel.tasks.training.backends.nemo_rl.preference_datasets ++# These constants ensure consistency between the discriminator and Tag values ++PREFERENCE_DATASET = "PreferenceDataset" ++BINARY_PREFERENCE_DATASET = "BinaryPreferenceDataset" ++HELPSTEER3_DATASET = "HelpSteer3" ++TULU3_PREFERENCE_DATASET = "Tulu3Preference" ++ ++ ++class ChatMessage(BaseModel): ++ """A single message in a conversation.""" ++ ++ role: str = Field(..., description="The role of the message sender (e.g., 
class PreferenceDatasetItemSchema(BaseModel):
    """Schema for native PreferenceDataset format.

    This is the canonical format used by nemo-rl's PreferenceDataset class.
    It supports multi-turn context and multiple ranked completions.

    Example:
        {
            "context": [{"role": "user", "content": "What is 2+2?"}],
            "completions": [
                {"rank": 0, "completion": [{"role": "assistant", "content": "4"}]},
                {"rank": 1, "completion": [{"role": "assistant", "content": "5"}]}
            ]
        }
    """

    # Both fields are required (Field(...) declares no default). Neither list
    # carries a length constraint here, so an empty list passes this model.
    context: List[ChatMessage] = Field(
        ..., description="The conversation context (prompt messages including previous turns)"
    )
    # The rank ordering described below is documentation only; this model does
    # not check that a rank-0 entry exists or that ranks are unique.
    completions: List[CompletionItem] = Field(
        ..., description="List of ranked completions (rank 0 = preferred, rank 1 = rejected, etc.)"
    )

    # extra="allow": keys other than the declared fields are accepted rather
    # than rejected, so records may carry additional metadata columns.
    model_config = ConfigDict(extra="allow")
class HelpSteer3DatasetItemSchema(BaseModel):
    """Schema for NVIDIA HelpSteer3 preference dataset format.

    Uses numeric preference scores to indicate which response is preferred.
    - Negative overall_preference: response1 is preferred
    - Positive overall_preference: response2 is preferred
    - Zero overall_preference: tie (no preference)

    Example:
        {
            "context": "Explain quantum computing",
            "response1": "Quantum computing uses qubits...",
            "response2": "Quantum computing is magic...",
            "overall_preference": -2
        }
    """

    # All four fields are required (Field(...) declares no default).
    context: Union[str, List[ChatMessage]] = Field(..., description="The input context (string or list of messages)")
    response1: str = Field(..., description="First response option")
    response2: str = Field(..., description="Second response option")
    # Declared as a plain int: no lower/upper bound is enforced by this model,
    # only the sign convention described in the class docstring is documented.
    overall_preference: int = Field(
        ...,
        description="Preference score: negative=response1 preferred, positive=response2 preferred, 0=tie",
    )

    # extra="allow": keys other than the declared fields are accepted rather
    # than rejected.
    model_config = ConfigDict(extra="allow")
def get_preference_dataset_discriminator(v: Any) -> str:
    """Pick the union tag for a preference-dataset record from the fields it carries.

    The checks run in this order and the first match wins:
    - PREFERENCE_DATASET: has both 'context' and 'completions' (native format)
    - HELPSTEER3_DATASET: has 'overall_preference' (HelpSteer3 format)
    - TULU3_PREFERENCE_DATASET: has 'chosen' and 'rejected', and 'chosen' is a
      non-empty list whose first element looks like a chat message (has 'role')
    - BINARY_PREFERENCE_DATASET: has 'prompt', 'chosen' and 'rejected'

    The Tulu3 check deliberately precedes the binary check because Tulu3
    records may also contain a 'prompt' field; the two are told apart by
    whether 'chosen' is a message list.

    Args:
        v: The data to discriminate. Dicts are probed by key; any other object
            (for example an already-built model instance) is probed by attribute.

    Returns:
        One of the module-level tag constants. PREFERENCE_DATASET is returned
        as the fallback when nothing matches.
    """
    if isinstance(v, dict):
        has_field = v.__contains__
        get_field = v.get
    else:
        # Previously every non-dict input fell straight through to the default
        # tag, so model instances of the other three schemas were mis-tagged.
        def has_field(name: str) -> bool:
            return hasattr(v, name)

        def get_field(name: str) -> Any:
            return getattr(v, name, None)

    # Native PreferenceDataset format: context + completions
    if has_field("completions") and has_field("context"):
        return PREFERENCE_DATASET

    # HelpSteer3 format: has overall_preference score
    if has_field("overall_preference"):
        return HELPSTEER3_DATASET

    if has_field("chosen") and has_field("rejected"):
        # Tulu3 format: chosen is a non-empty list of messages. A message may
        # be a raw dict or an object exposing a `role` attribute.
        chosen = get_field("chosen")
        if isinstance(chosen, list) and chosen:
            first = chosen[0]
            if (isinstance(first, dict) and "role" in first) or hasattr(first, "role"):
                return TULU3_PREFERENCE_DATASET

        # BinaryPreferenceDataset format: prompt + chosen + rejected
        if has_field("prompt"):
            return BINARY_PREFERENCE_DATASET

    return PREFERENCE_DATASET  # Default fallback
class SFTChatMessage(BaseModel):
    """A single message in an SFT chat conversation.

    Each message must have a role and at least one of: content, thinking, or tool_calls.

    Important: content and thinking are mutually exclusive within a single message.
    If both are needed, they should be in separate messages (e.g., one message with
    thinking followed by another message with content).
    """

    # NOTE: the class docstring and the Field descriptions are kept verbatim
    # because they are emitted into the generated JSON schema.
    role: str = Field(..., description="The role of the message sender (e.g., 'user', 'assistant', 'system')")
    content: str | None = Field(None, description="The content of the message")
    thinking: str | None = Field(None, description="Thinking/reasoning content")
    tool_calls: list[ToolCall] | None = Field(None, description="Tool calls made in this message")

    @staticmethod
    def _schema_extra(schema: dict[str, Any]) -> None:
        """Add anyOf constraint requiring at least one of content, thinking, or tool_calls."""
        schema["anyOf"] = [
            {
                "required": ["content"],
                "properties": {"content": {"type": "string"}},
                "not": {"required": ["thinking"]},
            },
            {
                "required": ["thinking"],
                "properties": {"thinking": {"type": "string"}},
                "not": {"required": ["content"]},
            },
            {"required": ["tool_calls"], "properties": {"tool_calls": {"minItems": 1}}},
        ]

    # extra="forbid": unknown keys on a message are rejected. The hook above
    # mutates the generated JSON schema in place.
    model_config = ConfigDict(extra="forbid", json_schema_extra=_schema_extra)

    @model_validator(mode="after")
    def check_has_content_or_thinking_or_tool_calls(self) -> "SFTChatMessage":
        """Validate that message has at least one of content, thinking, or tool_calls.

        An empty ``tool_calls`` list does not count as a payload, matching the
        ``minItems: 1`` requirement that ``_schema_extra`` puts on the
        tool_calls branch of the JSON schema.

        Also enforces that content and thinking are mutually exclusive - they cannot
        both be present in the same message.

        Raises:
            ValueError: If the message carries no payload, or carries both
                content and thinking.
        """
        # `not self.tool_calls` covers both None and [] so that a message whose
        # only payload is an empty tool-call list is rejected here as well.
        if self.content is None and self.thinking is None and not self.tool_calls:
            raise ValueError("Message must have at least one of: content, thinking, or tool_calls")

        if self.content is not None and self.thinking is not None:
            raise ValueError("Message cannot have both content and thinking - they are mutually exclusive")

        return self
class FunctionDefinitionDetails(BaseModel):
    """Details of a function definition for tool calling.

    Example:
        {
            "name": "get_weather",
            "description": "Get the current weather for a location",
            "parameters": {"type": "object", "properties": {...}},
            "required": ["location"]
        }
    """

    # name, description and parameters are required (Field(...) has no default).
    name: str = Field(..., description="The name of the function")
    description: str = Field(..., description="A description of what the function does")
    parameters: FunctionParameters = Field(..., description="The parameters schema for the function")
    # Optional, and declared at the function level next to `parameters`
    # (as in the docstring example) rather than inside the parameters object.
    required: list[str] | None = Field(None, description="List of required parameter names")
def get_sft_dataset_discriminator(v: Any) -> str:
    """Pick the union tag for an SFT dataset record from the fields it carries.

    The returned strings must equal the ``Tag(...)`` values used in
    ``SFTDatasetSchemaType``, which are the schema class names:
    - "EmbeddingDatasetItemSchema": has 'query', 'pos_doc' and 'neg_doc' (embedding format)
    - "SFTPChatDatasetItemSchema": has 'messages' (chat format)
    - "SFTPromptTemplateDatasetItemSchema": anything else (prompt template format)

    Args:
        v: The data to discriminate. Dicts are probed by key; any other object
            (for example an already-built model instance) is probed by attribute.

    Returns:
        Schema class name identifying the format.
    """
    if isinstance(v, dict):
        has_field = v.__contains__
    else:

        def has_field(name: str) -> bool:
            return hasattr(v, name)

    # Embedding format: has query, pos_doc, neg_doc fields
    if all(has_field(name) for name in ("query", "pos_doc", "neg_doc")):
        return "EmbeddingDatasetItemSchema"

    # Chat format: has messages array. The chat class is spelled
    # SFTPChatDatasetItemSchema (with a "P"); returning the name without it
    # yields a tag that matches no member of the union.
    if has_field("messages"):
        return "SFTPChatDatasetItemSchema"

    # Prompt template format: has prompt/completion or custom fields
    return "SFTPromptTemplateDatasetItemSchema"
++ Union[ ++ Annotated[SFTPromptTemplateDatasetItemSchema, Tag(str(SFTPromptTemplateDatasetItemSchema.__name__))], ++ Annotated[SFTPChatDatasetItemSchema, Tag(str(SFTPChatDatasetItemSchema.__name__))], ++ Annotated[EmbeddingDatasetItemSchema, Tag(str(EmbeddingDatasetItemSchema.__name__))], ++ ], ++ Discriminator(get_sft_dataset_discriminator), ++] +diff --git a/services/automodel/src/nmp/automodel/tasks/training/datasets/validation.py b/services/automodel/src/nmp/automodel/tasks/training/datasets/validation.py +new file mode 100644 +index 000000000..3e4e7f200 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/training/datasets/validation.py +@@ -0,0 +1,297 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++# ++# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual ++# property and proprietary rights in and to this material, related ++# documentation and any modifications thereto. Any use, reproduction, ++# disclosure or distribution of this material and related documentation ++# without an express license agreement from NVIDIA CORPORATION or ++# its affiliates is strictly prohibited. ++ ++import json ++import logging ++import os ++import re ++from typing import Any, Callable, Optional ++ ++import jsonschema ++from jsonschema import exceptions ++from nmp.automodel.entities.values import FinetuningType, TrainingType ++from nmp.automodel.tasks.training.datasets.preparation import DatasetFormatError ++from nmp.automodel.tasks.training.datasets.schemas import SFTDatasetSchemaType ++ ++logger = logging.getLogger(__name__) ++ ++ ++def SFT_SCHEMA(prompt_template: str | None = None): ++ """Generate JSON schema for SFT datasets. 
++ ++ Uses the SFTDatasetSchemaType union which supports: ++ - SFTPromptTemplateDatasetItemSchema: Flexible prompt template format ++ - SFTPChatDatasetItemSchema: Chat format with messages and tools ++ ++ Args: ++ prompt_template: Optional template string with placeholders like "{input} {output}". ++ If None or empty string, defaults to standard prompt/completion format. ++ Ignored for chat format detection. ++ ++ Returns: ++ JSON schema dict with required fields based on the format. ++ """ ++ from pydantic import TypeAdapter ++ ++ # Determine required fields for prompt template format ++ if prompt_template is not None and prompt_template != "": ++ # Extract placeholders from template ++ found_keys = re.findall(r"{(.*?)}", prompt_template) ++ ++ # TODO: Are we constrained by len == 2? ++ # Check for duplicates ++ if len(found_keys) != len(set(found_keys)): ++ duplicates = [key for key in found_keys if found_keys.count(key) > 1] ++ unique_duplicates = list(dict.fromkeys(duplicates)) ++ raise ValueError( ++ f"Prompt template contains duplicate placeholders: {unique_duplicates}. " ++ f"Each placeholder should appear only once." 
class DatasetValidator:
    """Validator for training datasets.

    This class encapsulates dataset validation logic and avoids parameter drilling
    by storing configuration as instance attributes.

    Example usage after prepare_dataset():
    ```python
    from nmp.automodel.tasks.training.datasets.preparation import prepare_dataset
    from nmp.automodel.tasks.training.datasets.validation import DatasetValidator

    # Prepare datasets
    prepared = prepare_dataset(
        dataset_path=Path(customizer_config.dataset.path),
        output_dir=workspace_dir / "dataset",
    )

    # Validate the prepared datasets
    validator = DatasetValidator(
        training_type=customizer_config.training.training_type,
        finetuning_type=customizer_config.training.finetuning_type,
        prompt_template=customizer_config.dataset.prompt_template,
    )
    validator.validate_dataset(str(prepared.train_file))
    validator.validate_dataset(str(prepared.validation_file))
    ```
    """

    def __init__(
        self,
        training_type: TrainingType,
        finetuning_type: Optional[FinetuningType] = None,
        prompt_template: str | None = None,
    ):
        """Initialize validator with training configuration.

        Args:
            training_type: The type of training (SFT, distillation, etc.)
            finetuning_type: Optional finetuning type (LoRA, all_weights, etc.).
                Stored on the instance; not consulted by the methods of this class.
            prompt_template: Optional prompt template, forwarded to the schema
                factory when a schema is built.
        """
        self.training_type = training_type
        self.finetuning_type = finetuning_type
        self.prompt_template = prompt_template

    def _validate_json_object(self, obj: dict, schema: dict[str, Any]) -> None:
        """Validate a JSON object against a schema.

        Args:
            obj: The JSON object to validate
            schema: The JSON schema to validate against

        Raises:
            TypeError: If validation fails for any reason. The original
                exception is attached as ``__cause__``.
        """
        try:
            jsonschema.validate(instance=obj, schema=schema)
        except exceptions.ValidationError as e:
            logger.debug(f"Dataset Schema Validation failed: {str(e)}")
            # e.message is the short reason; str(e) (logged above) includes the
            # failing instance and schema fragment.
            raise TypeError(f"Dataset Schema Validation failed: {e.message}") from e
        except Exception as e:
            logger.debug(f"Dataset Schema Validation failed: {str(e)}")
            raise TypeError(f"Dataset Schema Validation failed: {e}") from e

    def detect_dataset_schema(self, file_path: str) -> str:
        """Detect the dataset schema from the first record of the file.

        The first non-blank line is parsed and tried against every entry of
        ``SCHEMAS`` in order; the first schema it conforms to wins.

        Args:
            file_path: Path to the dataset file

        Returns:
            The matching key of ``SCHEMAS`` (a training type value).

        Raises:
            DatasetFormatError: If the first record is not valid JSON (this
                includes a file with no non-blank line) or matches no schema.
        """
        with open(file_path, "r", encoding="utf-8") as f:
            # validate_dataset() skips blank lines, so detection does too.
            line = next((raw for raw in f if raw.strip()), "")

        try:
            obj: dict[str, Any] = json.loads(line)
        except Exception as e:
            logger.debug(f"{file_path} has entry which is not valid json. Error: {e}\n{line}")
            raise DatasetFormatError(f"{file_path} has entry which is not a valid json: {e}") from e

        for schema_name, schema_factory in SCHEMAS.items():
            try:
                # Building the schema is inside the try on purpose: a factory
                # that rejects the prompt template simply rules this schema out.
                validation_schema = schema_factory(self.prompt_template)
                self._validate_json_object(obj, validation_schema)
            except Exception as e:
                logger.debug(f"Parsed jsonl line does not conform to schema {schema_name}. Error: {e}")
            else:
                logger.debug(f"Parsed jsonl line conforms to schema {schema_name}.")
                return schema_name

        raise DatasetFormatError("Dataset does not match any supported format")

    def validate_dataset(self, file_path: str, dataset_type: Optional[str] = None) -> None:
        """Validate every record of a single JSONL dataset file.

        Args:
            file_path: Path to the dataset file
            dataset_type: Optional key of ``SCHEMAS`` to validate against.
                If None, the validator's training type value is used.
                Types without an entry in ``SCHEMAS`` are skipped silently.

        Raises:
            DatasetFormatError: If the file holds no records, a line is not
                valid JSON, or a record does not conform to the schema.
        """
        # Use provided dataset_type or fall back to training type from config
        if dataset_type is None:
            dataset_type = self.training_type.value

        schema_factory = SCHEMAS.get(dataset_type)
        if not schema_factory:
            # Skip validation for unsupported types
            return

        if os.path.getsize(file_path) == 0:
            raise DatasetFormatError(f"{file_path} is empty")

        validation_schema = schema_factory(self.prompt_template)

        record_count = 0
        with open(file_path, "r", encoding="utf-8") as jsonl_file:
            for line_number, line in enumerate(jsonl_file, start=1):
                line = line.strip()
                if not line:
                    continue
                record_count += 1

                try:
                    obj: dict[str, Any] = json.loads(line)
                except Exception as e:
                    logger.debug(f"{file_path} has entry which is not valid json. Error: {e}\n{line}")
                    raise DatasetFormatError(
                        f"{file_path} has entry which is not valid json (line {line_number}): {e}"
                    ) from e

                try:
                    self._validate_json_object(obj, validation_schema)
                except Exception as e:
                    # The full schema and object go to the debug log only; the
                    # raised message stays short and points at the bad line.
                    logger.debug(
                        f"Parsed jsonl line does not conform to schema {validation_schema}. Error: {e}. Object: {obj}"
                    )
                    raise DatasetFormatError(
                        f"Parsed jsonl line does not conform to schema '{dataset_type}' "
                        f"({file_path}, line {line_number}). Error: {e}"
                    ) from e

        # A file made only of blank lines has a non-zero size but no records.
        if record_count == 0:
            raise DatasetFormatError(f"{file_path} is empty (no non-blank lines)")
def validate_dataset(
    file_path: str,
    training_type: TrainingType,
    dataset_type: Optional[str] = None,
    finetuning_type: Optional[FinetuningType] = None,
    prompt_template: str | None = None,
) -> None:
    """Functional shortcut: build a one-off DatasetValidator and check one file.

    Args:
        file_path: Path to the dataset file
        training_type: The type of training (SFT, DPO, etc.)
        dataset_type: Dataset type to validate against
        finetuning_type: Optional finetuning type (LoRA, all_weights, etc.)
        prompt_template: Optional prompt template for SFT datasets
    """
    DatasetValidator(
        training_type=training_type,
        finetuning_type=finetuning_type,
        prompt_template=prompt_template,
    ).validate_dataset(file_path, dataset_type=dataset_type)
@dataclass
class DistributedContext:
    """
    Distributed training context with file-based barrier synchronization.

    In multi-node training, all pods run the same entry point. This context
    provides:
    - Role detection (coordinator vs worker) based on RANK
    - File-based barriers for cross-pod synchronization

    File barriers work by:
    - Coordinator creates marker files to signal phase completion
    - Workers poll for marker files before proceeding
    - All ranks can sync via mutual signal-and-wait

    Every barrier method is a no-op when ``world_size`` is 1.

    Attributes:
        role: Whether this node is coordinator (rank 0) or worker
        rank: This node's rank in the distributed job
        world_size: Total number of nodes participating
        barrier_dir: Directory for barrier marker files (on shared storage).
            Always required; it is only created and used when world_size > 1.
    """

    role: DistributedRole
    rank: int
    world_size: int
    barrier_dir: Path
    # Default wait limit (seconds) and polling period (seconds) for barriers.
    _barrier_timeout: float = field(default=600.0, repr=False)
    _poll_interval: float = field(default=0.5, repr=False)

    @classmethod
    def from_env(cls, barrier_dir: Path) -> "DistributedContext":
        """
        Create distributed context from environment variables.

        The caller is responsible for constructing the barrier_dir path,
        including any task-specific namespacing for pause/resume support.

        Args:
            barrier_dir: Directory for barrier files (on shared storage).
                Caller should namespace this by task ID for pause/resume support.

        Environment Variables:
            RANK: This node's rank (default: 0)
            WORLD_SIZE: Total number of nodes (default: 1)

        Returns:
            Configured DistributedContext

        Raises:
            ValueError: If RANK or WORLD_SIZE is set to a non-integer value.
        """
        rank = int(os.environ.get(RANK_ENVVAR, "0"))
        world_size = int(os.environ.get(WORLD_SIZE_ENVVAR, "1"))

        role = DistributedRole.COORDINATOR if rank == 0 else DistributedRole.WORKER

        # Setup barrier directory if distributed
        if world_size > 1:
            # Coordinator cleans up stale barriers from previous task runs
            # (e.g., after pause/resume or retry). This must happen before
            # workers start waiting, so we do it here at initialization.
            if role == DistributedRole.COORDINATOR and barrier_dir.exists():
                logger.info(f"Cleaning up stale barriers from previous run: {barrier_dir}")
                shutil.rmtree(barrier_dir, ignore_errors=True)

            barrier_dir.mkdir(parents=True, exist_ok=True)

        ctx = cls(
            role=role,
            rank=rank,
            world_size=world_size,
            barrier_dir=barrier_dir,
        )

        logger.info(
            f"Distributed context: rank={rank}, world_size={world_size}, "
            f"role={role.value}, barriers={'enabled' if ctx.is_distributed else 'disabled'}"
        )

        return ctx

    @property
    def is_coordinator(self) -> bool:
        """True if this is the coordinator node (rank 0)."""
        return self.role == DistributedRole.COORDINATOR

    @property
    def is_distributed(self) -> bool:
        """True if running in multi-node mode."""
        return self.world_size > 1

    # --- Barrier Implementation ---

    def _marker_path(self, barrier_name: str, rank: int) -> Path:
        """Get path to barrier marker file for a specific rank."""
        return self.barrier_dir / f"{barrier_name}.rank{rank}.ready"

    def _resolve_timeout(self, timeout: float | None) -> float:
        """Return the caller's timeout, or the default when none was given.

        Only ``None`` selects the default, so an explicit ``0`` means
        "check once, do not wait" instead of silently becoming the default.
        """
        return self._barrier_timeout if timeout is None else timeout

    def _poll_until(self, is_ready, timeout: float) -> bool:
        """Poll ``is_ready()`` until it returns True or ``timeout`` seconds pass.

        The condition is always evaluated at least once. Elapsed time is
        measured with the monotonic clock so wall-clock adjustments cannot
        shorten or extend the wait.

        Args:
            is_ready: Zero-argument callable returning a truthy value when done
            timeout: Maximum time to wait (seconds)

        Returns:
            True if the condition was met, False if the timeout expired first.
        """
        deadline = time.monotonic() + timeout
        while True:
            if is_ready():
                return True
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return False
            # Never sleep past the deadline.
            time.sleep(min(self._poll_interval, remaining))

    def signal(self, barrier_name: str) -> None:
        """
        Signal that this rank has reached a synchronization point.

        Creates a marker file indicating this rank is ready.

        Args:
            barrier_name: Name of the barrier (should be unique per sync point)
        """
        if not self.is_distributed:
            return

        marker = self._marker_path(barrier_name, self.rank)
        marker.touch()
        logger.debug(f"Barrier signal: {barrier_name} (rank {self.rank})")

    def wait_for_coordinator(self, barrier_name: str, timeout: float | None = None) -> None:
        """
        Wait for the coordinator (rank 0) to signal.

        Used by workers to wait for coordinator to complete a phase.
        Returns immediately on the coordinator itself and in single-node mode.

        Args:
            barrier_name: Name of the barrier to wait for
            timeout: Override default timeout (seconds). None uses the default.

        Raises:
            TimeoutError: If coordinator doesn't signal within timeout
        """
        if not self.is_distributed:
            return

        if self.is_coordinator:
            # Coordinator doesn't wait for itself
            return

        timeout = self._resolve_timeout(timeout)
        marker = self._marker_path(barrier_name, rank=0)

        logger.debug(f"Waiting for coordinator at barrier: {barrier_name}")

        if self._poll_until(marker.exists, timeout):
            logger.debug(f"Coordinator signaled barrier: {barrier_name}")
            return

        raise TimeoutError(f"Timeout waiting for coordinator at barrier '{barrier_name}' after {timeout}s")

    def wait_all(self, barrier_name: str, timeout: float | None = None) -> None:
        """
        Wait for all ranks to reach this barrier.

        All ranks must call signal() before any rank proceeds.

        Args:
            barrier_name: Name of the barrier
            timeout: Override default timeout (seconds). None uses the default.

        Raises:
            TimeoutError: If not all ranks signal within timeout
        """
        if not self.is_distributed:
            return

        timeout = self._resolve_timeout(timeout)

        def missing_ranks() -> list[int]:
            return [r for r in range(self.world_size) if not self._marker_path(barrier_name, r).exists()]

        logger.debug(f"Waiting for all ranks at barrier: {barrier_name}")

        if self._poll_until(lambda: not missing_ranks(), timeout):
            logger.debug(f"All ranks reached barrier: {barrier_name}")
            return

        # Report which ranks are missing for debugging
        raise TimeoutError(f"Timeout at barrier '{barrier_name}' after {timeout}s. Missing ranks: {missing_ranks()}")

    def sync_point(self, barrier_name: str, timeout: float | None = None) -> None:
        """
        Synchronization point where all ranks must arrive before any proceed.

        Combines signal() and wait_all() - this rank signals and then waits
        for all other ranks.

        Args:
            barrier_name: Name of the sync point
            timeout: Override default timeout (seconds)

        Raises:
            TimeoutError: If not all ranks signal within timeout
        """
        self.signal(barrier_name)
        self.wait_all(barrier_name, timeout)

    def cleanup_barrier(self, barrier_name: str) -> None:
        """
        Clean up barrier marker files (coordinator only).

        Call after all ranks have passed the barrier. Failures to delete a
        marker are logged as warnings and do not raise.

        Args:
            barrier_name: Name of the barrier to clean up
        """
        if not self.is_distributed or not self.is_coordinator:
            return

        for r in range(self.world_size):
            marker = self._marker_path(barrier_name, r)
            try:
                # missing_ok avoids the check-then-delete race of exists() + unlink().
                marker.unlink(missing_ok=True)
            except OSError as e:
                logger.warning(f"Failed to clean up barrier marker {marker}: {e}")
++# SPDX-License-Identifier: Apache-2.0
++
++import logging
++import subprocess
++from pathlib import Path
++
++from nmp.common.errors import ExceptionConverter, RulesLoader
++
++from .exceptions import (
++    EXCEPTION_REGISTRY,
++    CustomizerTrainingError,
++    ErrorDetails,
++    InternalError,
++    default_exception_handler,
++)
++
++logger = logging.getLogger(__name__)
++
++# Path to the error rules YAML file (relative to this module)
++_ERROR_RULES_PATH = Path(__file__).parent / "error_rules.yaml"
++
++# Additional modules to search for exception types not in the registry
++# subprocess.TimeoutExpired is used for training timeout detection
++_FALLBACK_MODULES = [subprocess]
++
++# Module-level singleton converter
++_converter: ExceptionConverter | None = None
++
++
++def _load_converter() -> ExceptionConverter:
++    """Load the converter from YAML rules."""
++    logger.debug(f"Loading Customizer error rules from: {_ERROR_RULES_PATH}")
++
++    converter = RulesLoader.from_yaml(
++        _ERROR_RULES_PATH,
++        exception_registry=EXCEPTION_REGISTRY,
++        default_handler=default_exception_handler,
++        fallback_exception=InternalError,
++        fallback_modules=_FALLBACK_MODULES,
++    )
++
++    logger.info(f"Loaded {converter.rule_count} Customizer error mapping rules")
++    return converter
++
++
++def get_error_converter() -> ExceptionConverter:
++    """
++    Get the singleton ExceptionConverter for Customizer training errors.
++
++    The converter is created once on first access and reused for the module's lifetime.
++    It loads rules from error_rules.yaml and uses InternalError as fallback.
++
++    Returns:
++        Configured ExceptionConverter ready to convert exceptions.
++
++    Raises:
++        FileNotFoundError: If error_rules.yaml is not found.
++        ValueError: If rules file has invalid syntax.
++    """
++    global _converter
++    if _converter is None:
++        _converter = _load_converter()
++    return _converter
++
++
++def create_error_details(exception: Exception) -> ErrorDetails:
++    """
++    Create error_details dict for Jobs service reporting.
++
++    Converts the exception to a CustomizerTrainingError and returns
++    a dict suitable for passing to progress_reporter.report_error().
++
++    If the exception is already a CustomizerTrainingError, returns its
++    details directly without re-conversion.
++
++    Unmatched exceptions and converter failures (including rule loading) are reported as InternalError.
++
++    Args:
++        exception: The exception to convert.
++
++    Returns:
++        ErrorDetails with 'message', 'type', and 'detail' keys.
++    """
++    # If already a CustomizerTrainingError, return its details directly
++    if isinstance(exception, CustomizerTrainingError):
++        return exception.to_error_details()
++
++    # Convert using the library - fallback_exception=InternalError handles unmatched.
++    # The converter is obtained inside the try so a rules-loading failure is guarded too.
++    try:
++        get_error_converter().raise_converted_or_default(exception)
++    except CustomizerTrainingError as converted:
++        return converted.to_error_details()
++    except Exception as e:  # noqa: BLE001 - intentional last-resort guard to guarantee dict return
++        # Unexpected failure - fall through to the InternalError wrapper so a dict is always returned
++        logger.warning(f"Unexpected exception type from converter: {type(e).__name__}: {e}")
++    exc = InternalError(
++        message=f"An internal error occurred. ({type(exception).__name__}: {exception})",
++        detail=str(exception),
++    )
++    return exc.to_error_details()
++
++
++__all__ = [
++    "get_error_converter",
++    "create_error_details",
++]
+diff --git a/services/automodel/src/nmp/automodel/tasks/training/errors/error_rules.yaml b/services/automodel/src/nmp/automodel/tasks/training/errors/error_rules.yaml
+new file mode 100644
+index 000000000..1a16801b7
+--- /dev/null
++++ b/services/automodel/src/nmp/automodel/tasks/training/errors/error_rules.yaml
+@@ -0,0 +1,643 @@
++# This file defines rules for converting low-level training exceptions
++# into user-friendly CustomizerTrainingError subclasses.
++# Rules for all backends are present in the same yaml file.
++#
++# Rules are evaluated in order; first match wins.
++#
++# Rule structure:
++#   - MATCHER: PATTERN        # When to match (pick ONE, e.g. exact / contains / regex / type)
++#     exception: CLASS_NAME   # Exception class from EXCEPTION_REGISTRY
++#     error_details: MESSAGE  # Optional user-friendly message
++
++
++rules:
++  # ===========================================================================
++  # 1. TRAINING TIMEOUT (subprocess.TimeoutExpired)
++  # All backends
++  # ===========================================================================
++
++  - type: TimeoutExpired  # subprocess.TimeoutExpired from fallback_modules
++    exception: TrainingTimeoutError
++    error_details: "Training exceeded the maximum allowed time limit. To reduce training time: 1) Reduce max_steps or epochs, 2) Use a smaller dataset, 3) Use a smaller model, 4) Use LoRA/PEFT instead of all_weights fine-tuning (LoRA trains faster), or 5) Increase batch_size to process more samples per step (if GPU memory allows). If you need longer training times, contact your administrator to adjust the job timeout limits."
++
++  # ===========================================================================
++  # 2.
DATASET FORMAT ERRORS (400) ++ # =========================================================================== ++ ++ # --- Automodel --- ++ # Unsupported role in chat messages ++ - regex: "Unsupported role in messages: \\w+" ++ exception: DatasetFormatError ++ error_details: "Your dataset contains chat messages with an invalid role. Each message in a conversation must have a 'role' field with one of the following values: 'system' (for system prompts), 'user' (for user inputs), 'assistant' (for model responses), or 'tool' (for tool/function outputs). Please check your dataset and ensure all messages use valid roles." ++ ++ # --- NeMo-RL --- ++ # Text type error ++ - regex: "^text must be a string or a list of strings, got .+$" ++ exception: DatasetFormatError ++ error_details: "The 'text' field in your dataset has an invalid type. For NeMo-RL training (DPO/GRPO), the text field must be either a single string or a list of strings. Please check your dataset format and ensure the text field contains the correct data type." ++ ++ # Prompt file not found ++ - regex: "^Prompt file .+ not found$" ++ exception: DatasetFormatError ++ error_details: "The prompt template file specified in your training dataset configuration does not exist. Prompt templates define how your dataset samples are formatted for training. Please verify the prompt file path is correct and the file is accessible at the specified location." ++ ++ # --- Automodel --- ++ # Empty dataset ++ - regex: "^no sample to consume: \\d+$" ++ exception: DatasetFormatError ++ error_details: "Your dataset is empty or contains zero valid samples after filtering. This can happen if: 1) The dataset file is empty, 2) All samples were filtered out due to format issues, or 3) The dataset path is incorrect. Please verify your dataset contains valid training samples." 
++ ++ # All samples consumed ++ - regex: "^no samples left to consume: \\d+, \\d+$" ++ exception: DatasetFormatError ++ error_details: "All samples in your dataset have been consumed before completing the requested number of training steps. This happens when your dataset is too small for the configured epochs or max_steps. Please either: 1) Add more samples to your dataset, 2) Reduce the number of epochs, or 3) Reduce max_steps." ++ ++ # Error loading example ++ - regex: "Error while loading example \\d+ from dataset .+" ++ exception: DatasetFormatError ++ error_details: "Failed to load a specific sample from your dataset. This typically indicates a malformed sample that doesn't match the expected format. Please check your dataset for: 1) Missing required fields, 2) Invalid JSON formatting, 3) Incorrect data types for fields. The error message includes the sample index to help you locate the problematic entry." ++ ++ # =========================================================================== ++ # 3. MODEL NOT FOUND ERRORS (404) ++ # Megatron Bridge ++ # =========================================================================== ++ ++ # Checkpoint file not found (input model checkpoint for training) ++ - regex: "^Checkpoint file not found: .+$" ++ exception: ModelNotFoundError ++ error_details: "The input model checkpoint file could not be found. Please verify the base model path is correct and accessible. This checkpoint is used as the starting point for training." ++ ++ # No checkpoints found for resume (output checkpoint directory empty) ++ - regex: "There were no checkpoints found in checkpoint_dir.*Cannot resume" ++ exception: ModelNotFoundError ++ error_details: "The output checkpoint directory is empty. Cannot resume training because no previous training checkpoints were found. Ensure a prior training run completed successfully and saved checkpoints." 
++ ++ # Nemotron model missing HF source ++ - regex: "Nemotron Super models expect HF source code to exist at .+" ++ exception: ModelNotFoundError ++ error_details: "The Nemotron Super model checkpoint is missing the required HuggingFace source code directory (nemotron_src/). This directory must be present inside the model checkpoint. Please ensure you are using a complete Nemotron Super model checkpoint that includes the HuggingFace source files." ++ ++ # =========================================================================== ++ # 4. MODEL LOAD ERRORS (500) ++ # =========================================================================== ++ ++ # --- Automodel --- ++ # Model weights swap failure ++ - contains: "_apply(): Couldn't swap" ++ exception: ModelLoadError ++ error_details: "Failed to load the base model: weights could not be applied to a model layer. The base model checkpoint may be corrupted, incomplete, or incompatible with the selected training configuration." ++ ++ # Model patching failure ++ - exact: "Failed to patch model" ++ exception: ModelLoadError ++ error_details: "Failed to apply optimizations to the base model. The base model architecture may not be supported for the selected training configuration. Try using a different model or training method." ++ ++ # Method signature mismatch ++ - starts_with: "Signature mismatch:" ++ exception: ModelLoadError ++ error_details: "The base model has an incompatible method signature. This typically indicates a version mismatch between the base model and the training framework. Please verify you are using a supported model version." ++ ++ # Missing lm_head.weight ++ - exact: "lm_head.weight not found in model" ++ exception: ModelLoadError ++ error_details: "The base model is missing the language model head (lm_head.weight). The base model checkpoint may be corrupted, incomplete, or not a valid language model. Please verify the base model is a complete, valid language model checkpoint." 
++ ++ # --- NeMo-RL --- ++ # vLLM not installed ++ - contains: "vLLM is not installed" ++ exception: ModelLoadError ++ error_details: "vLLM is not installed in the training environment. This is an issue with the training environment setup, please contact the administrator to raise an issue with the NeMo Platform team." ++ ++ # Missing generation output keys ++ - regex: "^Missing required keys for GenerationOutputSpec: .+$" ++ exception: ModelLoadError ++ error_details: "The base model's generation output is missing required fields. The base model may not be compatible with the selected training method (e.g., GRPO). Please verify you are using a supported model for this training type." ++ ++ # Missing score output keys ++ - regex: "^Missing required keys for ScoreOutputSpec: .+$" ++ exception: ModelLoadError ++ error_details: "The base model's score output is missing required fields. The base model may not be compatible with the selected training method. Please verify you are using a supported model for this training type." ++ ++ # Pretrained run config not found (Megatron HF-to-mcore conversion) ++ - contains: "Pretrained run config not found at" ++ exception: ModelLoadError ++ error_details: "The pretrained model configuration file was not found after Megatron checkpoint conversion. This usually means the HuggingFace-to-Megatron conversion on the head node saved to a directory not accessible by this worker node. This is an infrastructure issue - please ensure shared storage is properly mounted across all nodes, or contact your administrator." ++ ++ # --- Megatron Bridge --- ++ # Shape mismatch for parameter ++ - regex: "^Shape mismatch for parameter .+: target shape .+ vs source shape .+$" ++ exception: ModelLoadError ++ error_details: "The base model parameter shape does not match the checkpoint. The base model checkpoint may be from a different model architecture or an incompatible version. 
Please ensure the base model matches the expected architecture for this training configuration." ++ ++ # Shape mismatch for buffer ++ - regex: "^Shape mismatch for buffer .+: .+ vs .+$" ++ exception: ModelLoadError ++ error_details: "The base model buffer shape does not match the checkpoint. The base model checkpoint may be corrupted, incomplete, or from an incompatible model version. Please verify the base model checkpoint is valid and complete." ++ ++ # =========================================================================== ++ # 5. TRAINING CONFIG ERRORS - PARALLELISM (400) ++ # =========================================================================== ++ ++ # --- Automodel --- ++ # Pipeline parallelism: tied embeddings not supported ++ - all_keywords: ["not compatible with pipeline parallelism", "tie_word_embeddings"] ++ exception: TrainingConfigError ++ error_details: "The base model has tied embeddings (tie_word_embeddings=True) which is not compatible with pipeline parallelism. Try using a different parallelism configuration or a model without tied embeddings." ++ ++ # Pipeline parallelism: encoder-decoder models not supported ++ - all_keywords: ["not compatible with pipeline parallelism", "Encoder-Decoder"] ++ exception: TrainingConfigError ++ error_details: "The base model is an encoder-decoder architecture (like T5 or BART) which is not supported with pipeline parallelism. Please use a decoder-only base model, or disable pipeline parallelism in your training configuration." ++ ++ # PP batch size / microbatch validation ++ - contains: "pp_batch_size // pp_microbatch_size must be >= pp_size" ++ exception: TrainingConfigError ++ error_details: "Pipeline parallelism requires: batch_size >= pipeline_parallel_size. The current batch_size is too small to fill all pipeline stages. Either increase batch_size or reduce pipeline_parallel_size." 
++ ++ # Context parallelism: SDPA not supported ++ - contains: "Model does not support SDPA required for context parallelism" ++ exception: TrainingConfigError ++ error_details: "The base model does not support scaled dot-product attention (SDPA) which is required for context parallelism. Please set context_parallel_size=1 to disable context parallelism." ++ ++ # --- NeMo-RL --- ++ # Megatron and DTensor both enabled ++ - exact: "Configure either Megatron (policy.megatron_cfg.enabled=true) or DTensor (policy.dtensor_cfg.enabled=true), not both." ++ exception: TrainingConfigError ++ error_details: "Internal configuration error: both Megatron and DTensor training backends are enabled, but only one can be active at a time. This is an issue with the training environment setup, please contact the administrator." ++ ++ # Neither Megatron nor DTensor enabled ++ - contains: "Please either set policy.megatron_cfg.enabled=true" ++ exception: TrainingConfigError ++ error_details: "Internal configuration error: no training backend is enabled. The training environment requires either Megatron or DTensor backend to be active. This is an issue with the training environment setup, please contact the administrator." ++ ++ # World size insufficient for parallelism ++ - regex: "^World size \\(\\d+\\) is insufficient for the parallelism configuration" ++ exception: TrainingConfigError ++ error_details: "Not enough GPUs available for the requested parallelism settings. The total number of GPUs must be at least pipeline_parallel_size * context_parallel_size * tensor_parallel_size. Either reduce parallelism settings or request more GPUs." ++ ++ # World size not divisible by parallelism ++ - regex: "^World size \\(\\d+\\) must be divisible by PP \\* CP \\* TP" ++ exception: TrainingConfigError ++ error_details: "The total number of GPUs must be evenly divisible by (pipeline_parallel_size * context_parallel_size * tensor_parallel_size). 
For example, with PP=2, CP=1, TP=2, you need 4, 8, 12, etc. GPUs. Please adjust your parallelism settings or cluster size." ++ ++ # DTensor world size mismatch ++ - regex: "^World size\\(\\d+\\) must equal to dp_size\\(\\d+\\) \\* tp_size\\(\\d+\\) \\* cp_size\\(\\d+\\) to use DTensor$" ++ exception: TrainingConfigError ++ error_details: "The total number of GPUs (world_size) does not match the product of data_parallel_size * tensor_parallel_size * context_parallel_size for the DTensor backend. Please adjust your parallelism settings so they are consistent with the available GPU count." ++ ++ # Dynamic batching with PP > 1 ++ - contains: "Dynamic batching is only supported for single pipeline parallel stage" ++ exception: TrainingConfigError ++ error_details: "Dynamic batching is only supported when pipeline_parallel_size=1. With pipeline parallelism (PP > 1), the model is split across GPU stages which requires fixed batch sizes. Please either set pipeline_parallel_size=1 or disable dynamic batching." ++ ++ # Dynamic batching exclusive of sequence packing ++ - contains: "Dynamic Batching is exclusive of Sequence Packing" ++ exception: TrainingConfigError ++ error_details: "Dynamic batching and sequence packing cannot be used together. Please disable one of them: either set dynamic_batching=false or set sequence_packing_enabled=false." ++ ++ # Sequence packing not supported for VLM models ++ - contains: "Sequence packing is not supported for VLM models" ++ exception: TrainingConfigError ++ error_details: "Sequence packing is not supported for Vision-Language Models (VLMs). Please set sequence_packing_enabled=false when training VLM models." ++ ++ # Context parallel not supported for sequence packing (DTensor) ++ - exact: "Context parallel is not supported for sequence packing. Refer to https://github.com/NVIDIA/NeMo-RL/blob/main/docs/model-quirks.md#context-parallel-with-fsdp2 for more details." 
++ exception: TrainingConfigError ++ error_details: "Context parallelism cannot be used with sequence packing in the DTensor backend. Please either set context_parallel_size=1 to disable context parallelism, or set sequence_packing_enabled=false to disable sequence packing." ++ ++ # Context parallel not supported for Gemma3 ++ - contains: "Context parallel is not supported for Gemma3ForCausalLM" ++ exception: TrainingConfigError ++ error_details: "Context parallelism is not supported for Gemma3 models due to limitations in the PyTorch context parallel implementation. Please set context_parallel_size=1 when training Gemma3 models." ++ ++ # Context parallel not supported for VLM models ++ - contains: "Context parallel is yet not supported for VLM models" ++ exception: TrainingConfigError ++ error_details: "Context parallelism is not yet supported for Vision-Language Models (VLMs). Please set context_parallel_size=1 when training VLM models." ++ ++ # Context parallelism requires sequence packing (Megatron) ++ - contains: "Context Parallelism (CP>1) requires sequence packing to be enabled" ++ exception: TrainingConfigError ++ error_details: "When using the Megatron backend with context_parallel_size > 1, sequence packing must be enabled. Please either enable sequence packing (sequence_packing_enabled=true) or reduce context_parallel_size to 1." ++ ++ # Reward models not supported with Megatron backend ++ - contains: "Reward models are not yet supported with the Megatron backend" ++ exception: TrainingConfigError ++ error_details: "Reward models are not yet supported with the Megatron training backend. This is a current limitation of the framework. Please use the DTensor backend for reward model training, or contact your administrator for alternative configurations." 
++ ++ # Dynamic sampling max batches reached ++ - contains: "Dynamic sampling has reached the maximum allowed number of batches" ++ exception: TrainingConfigError ++ error_details: "Dynamic sampling exceeded the maximum number of generation batches allowed per training step. This means the training data or reward signal is too challenging for the model to produce enough valid samples. Consider: 1) Simplifying your dataset, 2) Adjusting num_prompts_per_step or num_generations_per_prompt, 3) Checking that your reward function is not too strict." ++ ++ # Batch size not divisible by DP ++ - regex: "Configuration error: \\(num_prompts_per_step \\* num_generations_per_prompt\\) = \\d+ must be divisible by data_parallel size \\d+" ++ exception: TrainingConfigError ++ error_details: "The effective batch size (num_prompts_per_step * num_generations_per_prompt) must be evenly divisible by the number of data parallel workers. Please adjust num_prompts_per_step or num_generations_per_prompt so their product divides evenly." ++ ++ # =========================================================================== ++ # 6. TRAINING CONFIG ERRORS - DPO/GRPO (400) ++ # NeMo-RL ++ # =========================================================================== ++ ++ # Dynamic batching with DPO ++ - contains: "Dynamic batching is currently not supported with DPO" ++ exception: TrainingConfigError ++ error_details: "DPO (Direct Preference Optimization) training does not support dynamic batching. This is an internal configuration issue with the training environment, please contact the administrator." ++ ++ # Sequence packing with DPO ++ - contains: "Sequence packing is currently not supported with DPO" ++ exception: TrainingConfigError ++ error_details: "DPO (Direct Preference Optimization) training does not support sequence packing. Please set sequence_packing_enabled=false in your training request." 
++ ++ # GRPO requires generation config ++ - contains: "A generation config in the PolicyConfig is required for GRPO" ++ exception: TrainingConfigError ++ error_details: "GRPO (Group Relative Policy Optimization) requires a generation configuration to produce responses during training. This is an internal configuration issue with the training environment, please contact the administrator." ++ ++ # Validation dataset required ++ - exact: "Validation dataset is required if validation is enabled" ++ exception: TrainingConfigError ++ error_details: "Validation is enabled for this training job, but no validation dataset was provided. Please provide a validation dataset in your training request, or disable validation." ++ ++ # Non-colocated inference with Megatron ++ - contains: "Non-colocated inference is not supported for Megatron generation backends" ++ exception: TrainingConfigError ++ error_details: "The current training configuration uses Megatron for generation, which does not support the required inference mode. This is an internal configuration issue with the training environment, please contact the administrator." ++ ++ # Async GRPO requires vLLM async ++ - contains: "Async GRPO requires vLLM backend with vllm_cfg.async_engine=True" ++ exception: TrainingConfigError ++ error_details: "Async GRPO training requires the vLLM backend with async engine enabled, but the current configuration does not have this set. This is an internal configuration issue with the training environment, please contact the administrator." ++ ++ # Async GRPO requires importance sampling ++ - contains: "Importance sampling correction must be enabled for async GRPO" ++ exception: TrainingConfigError ++ error_details: "Async GRPO training requires importance sampling correction to handle off-policy samples and ensure stable training. This is an internal configuration issue with the training environment, please contact the administrator." 
++ ++ # Async GRPO doesn't support colocated inference ++ - contains: "Colocated inference is not supported for async GRPO" ++ exception: TrainingConfigError ++ error_details: "Async GRPO training does not support colocated inference (running training and generation on the same GPUs). This is an internal configuration issue with the training environment, please contact the administrator." ++ ++ # top_k sampling threshold (vLLM V1 engine limitation) ++ - contains: "top_k sampling with values <" ++ exception: TrainingConfigError ++ error_details: "The top_k value is too low for the vLLM V1 engine. The vLLM V1 engine does not return logprobs after top_k filtering, so very low top_k values produce inaccurate logprob computations. Please increase top_k or remove the top_k constraint." ++ ++ # top_p sampling threshold (vLLM V1 engine limitation) ++ - contains: "top_p sampling with values <" ++ exception: TrainingConfigError ++ error_details: "The top_p value is too low for the vLLM V1 engine. The vLLM V1 engine does not return logprobs after top_p filtering, so very low top_p values produce inaccurate logprob computations. Please increase top_p or remove the top_p constraint." ++ ++ # MoE aux loss not supported ++ - contains: "MoE aux loss is currently not supported" ++ exception: TrainingConfigError ++ error_details: "Mixture-of-Experts (MoE) auxiliary loss is not currently supported due to a known bug in Megatron-LM. Please disable the MoE auxiliary loss in your training configuration." ++ ++ # =========================================================================== ++ # 7. TRAINING CONFIG ERRORS - PEFT/LORA (400) ++ # Automodel ++ # =========================================================================== ++ ++ # Triton not installed ++ - contains: "triton is not installed" ++ exception: TrainingConfigError ++ error_details: "The Triton library, which is required for optimized LoRA kernel operations, is not installed in the training environment. 
This is an issue with the training environment setup, please contact the administrator to ensure Triton is properly installed." ++ ++ # LoRA dimensions mismatch ++ - contains: "Incompatible X and LoRA A dimensions" ++ exception: TrainingConfigError ++ error_details: "The LoRA adapter dimensions are incompatible with the base model's layer dimensions. This can happen if you are trying to apply a pre-trained LoRA adapter that was created for a different model architecture. Please ensure the LoRA configuration (lora_dim/rank) is compatible with the base model you are fine-tuning." ++ ++ # =========================================================================== ++ # 8. TRAINING CONFIG ERRORS - PACKING (400) ++ # NeMo-RL ++ # =========================================================================== ++ ++ # Sequence too long for packing ++ - regex: "^Sequence length \\d+ exceeds bin capacity \\d+$" ++ exception: TrainingConfigError ++ error_details: "When sequence packing is enabled, one or more sequences in your dataset exceed the maximum sequence length (max_seq_length). Sequence packing combines multiple shorter sequences into a single training sample, but each individual sequence must fit within max_seq_length. Please either increase max_seq_length to accommodate longer sequences, or preprocess your dataset to truncate or remove sequences that are too long." ++ ++ # Not enough sequences for packing ++ - regex: "^Cannot create \\d+ bins with only \\d+ sequences" ++ exception: TrainingConfigError ++ error_details: "When sequence packing is enabled, the packing algorithm needs enough sequences to efficiently fill the training batches. Your dataset does not have enough sequences for the current batch configuration. Please either add more samples to your dataset, reduce the batch_size, or disable sequence packing by setting sequence_packing_enabled=false." ++ ++ # =========================================================================== ++ # 9. 
ENVIRONMENT ERRORS (400) ++ # NeMo-RL ++ # =========================================================================== ++ ++ # Unable to find compatible environment ++ - regex: "^Unable to find compatible environment - .+$" ++ exception: TrainingEnvironmentError ++ error_details: "The specified GRPO environment name is not recognized. GRPO (Group Relative Policy Optimization) requires a valid environment that defines how to evaluate model responses. Please check the environment name in your training request and ensure it matches one of the supported environments for your use case." ++ ++ # GRPO environment required ++ - exact: "hyperparameters.environment is required for GRPO, but it is not set" ++ exception: TrainingEnvironmentError ++ error_details: "GRPO (Group Relative Policy Optimization) training requires an environment configuration to evaluate model responses and compute rewards. Please specify the environment in your training request's hyperparameters. The environment determines how the model's generated responses will be scored during reinforcement learning." ++ ++ # No environment for task type ++ - regex: "^No environment found for task type: .+$" ++ exception: TrainingEnvironmentError ++ error_details: "No GRPO environment is registered for the specified task type. The environment defines how model responses are evaluated during reinforcement learning. This may indicate an unsupported task type or a misconfiguration. Please verify your task type is supported for GRPO training." ++ ++ # =========================================================================== ++ # 10. CHECKPOINT ERRORS (500) ++ # =========================================================================== ++ ++ # --- Automodel --- ++ # Checkpoint directory already exists ++ - regex: "Checkpoint directory .* already exists" ++ exception: CheckpointError ++ error_details: "The output checkpoint directory already exists from a previous training run. 
This typically happens when a previous training job failed or was cancelled but left partial checkpoint files behind. Please use a clean output directory, if you do not have access to remove the existing checkpoint directory, contact your administrator." ++ ++ # Global plan validation failure ++ - exact: "Failed to validate global plan" ++ exception: CheckpointError ++ error_details: "Checkpoint validation failed during distributed checkpoint loading. This occurs when the 'global plan' (which coordinates how model weights are distributed across GPUs) cannot be validated. Common causes include: 1) Corrupted checkpoint metadata files, 2) Mismatch between the number of GPUs used when saving vs loading the checkpoint, or 3) Interrupted checkpoint save operation. Please ensure the checkpoint is complete and you are using the same GPU topology as when the checkpoint was saved." ++ ++ # Missing key in checkpoint ++ - starts_with: "Missing key in checkpoint state_dict:" ++ exception: CheckpointError ++ error_details: "The checkpoint is missing one or more required model weights. This typically indicates that the checkpoint file is corrupted, incomplete (possibly from an interrupted save), or was created from a different model architecture than the one being loaded. Please verify the checkpoint is complete and matches the expected model architecture." ++ ++ # MoE expert weights missing ++ - contains: "Expert weights missing from checkpoint" ++ exception: CheckpointError ++ error_details: "The checkpoint for this Mixture-of-Experts (MoE) model is missing one or more expert weights. MoE models have multiple 'expert' sub-networks, and all expert weights must be present in the checkpoint. This typically indicates the checkpoint is corrupted or was saved incorrectly. Please use a complete, valid MoE checkpoint." 
++ ++ # --- NeMo-RL --- ++ # Checkpoint file corrupted (JSONDecodeError) ++ - type_name: JSONDecodeError ++ exception: CheckpointError ++ error_details: "The checkpoint metadata file (training_info.json) is corrupted and cannot be parsed. This file stores training progress information like the current step and loss values. The checkpoint may have been saved incompletely or the file was corrupted during storage." ++ ++ # Distributed process group not initialized for checkpoint save ++ - exact: "Distributed process group is not initialized. Cannot save checkpoint." ++ exception: CheckpointError ++ error_details: "Cannot save checkpoint because the distributed process group is not initialized. This typically occurs when the training cluster encountered communication issues before checkpoint saving could complete. This is a transient infrastructure issue - please try running your training job again." ++ ++ # Megatron core state not initialized for checkpoint save ++ - exact: "Megatron core state or model is not initialized. Cannot save checkpoint." ++ exception: CheckpointError ++ error_details: "Cannot save checkpoint because the Megatron model state is not initialized. This typically occurs when the model failed to load or initialize correctly before training could produce a checkpoint. Please verify the base model is valid and try again." ++ ++ # HF checkpoint already exists ++ - regex: "^HF checkpoint already exists at .+\\. Delete it to run or set overwrite=True\\.$" ++ exception: CheckpointError ++ error_details: "The HuggingFace checkpoint output directory already exists from a previous training run or conversion. This typically happens when a previous training job left partial output behind. Please use a clean output directory, or contact your administrator to remove the existing checkpoint." ++ ++ # =========================================================================== ++ # 11. 
CUDA/GPU ERRORS (500) ++ # =========================================================================== ++ ++ # --- NeMo-RL --- ++ # Disk space exhausted - occurs in Ray cluster workers during RL training ++ # Ray stores session logs in /tmp/ray/session_*/logs/ which can fill up ephemeral node storage ++ - contains: "No space left on device" ++ exception: DistributedError ++ error_details: "Disk space exhausted on the node's ephemeral storage (/tmp). During reinforcement learning training (DPO/GRPO), Ray stores session logs and temporary files in /tmp/ray/ which can fill up the node's local disk. This is separate from the PVC used for checkpoints and datasets. This is typically a transient infrastructure issue - please try running your training job again, or contact your administrator to ensure adequate ephemeral storage is configured for the cluster nodes." ++ ++ # CUDA out of memory - catch by type name ++ - type_name: OutOfMemoryError ++ exception: CudaError ++ error_details: "GPU out of memory. To reduce memory usage: 1) Lower batch_size, 2) Reduce max_seq_length, 3) Use LoRA/PEFT instead of all_weights fine-tuning, or 4) Use a model with fewer parameters." ++ ++ # CUDA OOM - catch by message pattern ++ - contains: "CUDA out of memory" ++ exception: CudaError ++ error_details: "GPU out of memory. To reduce memory usage: 1) Lower batch_size, 2) Reduce max_seq_length, 3) Use LoRA/PEFT instead of all_weights fine-tuning, or 4) Use a model with fewer parameters." ++ ++ # OOM keyword ++ - contains: "out of memory" ++ exception: CudaError ++ error_details: "GPU out of memory. To reduce memory usage: 1) Lower batch_size, 2) Reduce max_seq_length, 3) Use LoRA/PEFT instead of all_weights fine-tuning, or 4) Use a model with fewer parameters." ++ ++ # General CUDA errors ++ - and: ++ - any_keywords: ["CUDA", "cuda"] ++ - any_keywords: ["error", "Error", "failed", "Failed"] ++ exception: CudaError ++ error_details: "A GPU/CUDA error occurred. 
Please check GPU availability, ensure the GPU is not being used by another process, and try again." ++ ++ # =========================================================================== ++ # 12. DISTRIBUTED ERRORS (500) ++ # =========================================================================== ++ ++ # --- Automodel --- ++ # torch.distributed not available ++ - exact: "torch.distributed not available" ++ exception: DistributedError ++ error_details: "The PyTorch distributed package is not available in the training environment. Distributed training requires PyTorch to be built with distributed support enabled. This is an issue with the training environment setup, please contact the administrator to ensure the correct PyTorch version is installed." ++ ++ # torch.distributed not initialized ++ - exact: "expected torch.distributed to be initialized" ++ exception: DistributedError ++ error_details: "PyTorch distributed training was not properly initialized before the training process started. This typically happens when the training script is not launched correctly with the distributed launcher (torchrun). This is an issue with the training environment setup, please contact the administrator." ++ ++ # Distributed timeout - check for TimeoutError in cause chain ++ - cause: ++ type_name: TimeoutError ++ recursive: true ++ exception: DistributedError ++ error_details: "A distributed training operation timed out while waiting for communication between GPUs or nodes. This can happen when: 1) One or more GPU workers crashed or became unresponsive, 2) Network connectivity issues between nodes, 3) Uneven workload causing some GPUs to wait too long for others. This may be a transient issue - please try running your training job again. If the problem persists, contact your administrator." 
++ ++ # NCCL errors ++ - any_keywords: ["NCCL", "nccl"] ++ exception: DistributedError ++ error_details: "An NCCL (NVIDIA Collective Communications Library) error occurred during GPU-to-GPU communication. NCCL is used to synchronize data between GPUs during distributed training. Common causes include: 1) Network connectivity issues between GPU nodes, 2) GPU hardware problems, 3) Incompatible NCCL versions, or 4) Memory pressure on GPUs. This may be a transient issue - please try running your training job again. If the problem persists, contact your administrator." ++ ++ # c10d errors ++ - contains: "c10d" ++ exception: DistributedError ++ error_details: "A PyTorch distributed communication error occurred (c10d is PyTorch's distributed communication backend). This indicates a failure in the inter-process or inter-node communication during distributed training. This may be caused by network issues, process crashes, or resource exhaustion. Please try running your training job again. If the problem persists, contact your administrator." ++ ++ # --- NeMo-RL --- ++ # Not enough GPUs ++ - and: ++ - type_name: ResourceInsufficientError ++ - contains: "Not enough GPUs available" ++ exception: DistributedError ++ error_details: "The training cluster does not have enough GPUs available for your requested configuration. Your training job requires more GPUs than are currently available in the cluster. Try reducing the parallelism settings (tensor_parallel_size, pipeline_parallel_size) to require fewer GPUs." ++ ++ # Not enough CPUs ++ - and: ++ - type_name: ResourceInsufficientError ++ - contains: "Not enough CPUs available" ++ exception: DistributedError ++ error_details: "The training cluster does not have enough CPUs available for your requested configuration. CPUs are needed for data loading and preprocessing alongside GPU training." 
++ ++ # Maximum retries reached ++ - and: ++ - type_name: ResourceInsufficientError ++ - contains: "Maximum number of retries reached" ++ exception: DistributedError ++ error_details: "Failed to allocate cluster resources after multiple retry attempts. This is typically a transient issue - please wait a few minutes and try submitting your training job again. If the problem persists, contact your administrator to check cluster health." ++ ++ # Placement group timeout ++ - contains: "Timed out waiting for placement groups to be ready" ++ exception: DistributedError ++ error_details: "Timed out while waiting for Ray placement groups to be allocated. Placement groups are used to co-locate GPU workers on the same nodes for efficient communication. This typically happens when the cluster is under heavy load and cannot allocate the required resources in time. Please try submitting your training job again. If the problem persists, contact your administrator." ++ ++ # No valid placement groups ++ - contains: "No valid placement groups found" ++ exception: DistributedError ++ error_details: "No valid Ray placement groups could be found for the training job. This indicates a problem with the distributed training cluster configuration or resource availability. This is an infrastructure issue - please contact your administrator to investigate the cluster setup." ++ ++ # Workers per node mismatch ++ - regex: "^workers_per_node list length \\(\\d+\\) must match" ++ exception: DistributedError ++ error_details: "The workers-per-node configuration does not match the number of placement groups allocated. This indicates an internal mismatch in the distributed training setup. This is an infrastructure issue - please contact your administrator." 
++ ++ # Missing sharding annotations ++ - exact: "Sharding annotations must be provided to use sharded data distribution" ++ exception: DistributedError ++ error_details: "The training configuration requires sharded data distribution but sharding annotations are not provided. Sharding annotations specify how data should be distributed across workers for efficient parallel processing. This is an internal configuration issue - please contact your administrator." ++ ++ # =========================================================================== ++ # 13. GENERATION ERRORS (500) ++ # NeMo-RL ++ # =========================================================================== ++ ++ # Weight update failed during refit ++ - regex: "^Updating weights for the generation policy failed during refit" ++ exception: GenerationError ++ error_details: "Failed to update the vLLM generation model weights from the training policy during the 'refit' step. In GRPO training, the generation model periodically syncs weights from the training model. This failure may be caused by: 1) CUDA IPC (Inter-Process Communication) issues between training and generation workers, 2) NCCL communication errors, or 3) Memory pressure on GPUs. This is typically a transient issue - please try running your training job again." ++ ++ # generate_text with async_engine ++ - contains: "generate_text cannot be used with async_engine=True" ++ exception: GenerationError ++ error_details: "A synchronous generation method was called on an async vLLM engine. When async_engine is enabled, you must use async methods (e.g., generate_text_async). This is an internal configuration issue with the training environment, please contact the administrator." ++ ++ # update_weights_via_ipc with async_engine ++ - contains: "cannot be used with async_engine=True" ++ exception: GenerationError ++ error_details: "A synchronous method was called on an async vLLM engine. 
When async_engine is enabled, all vLLM operations must use their async variants. This is an internal configuration issue with the training environment, please contact the administrator." ++ ++ # Error in sample rollout ++ - regex: "^Error in sample \\d+ rollout: .+$" ++ exception: GenerationError ++ error_details: "An error occurred while generating a response (rollout) for one of the training samples during GRPO training. Rollouts are the model-generated responses used to compute rewards and policy gradients. This may be caused by: 1) Invalid input data in the sample, 2) Generation parameters causing issues (e.g., max_tokens too low), or 3) vLLM backend errors. Check your dataset for problematic samples." ++ ++ # Async generation not enabled ++ - contains: "Async generation is not enabled" ++ exception: GenerationError ++ error_details: "Async generation was requested but the vLLM engine is not configured with async_engine=True. Async generation allows overlapping training and generation for better throughput. This is an internal configuration issue with the training environment, please contact the administrator." ++ ++ # NeMo-Gym/Penguin requires async vLLM ++ - contains: "you must use vllm generation backend with" ++ exception: GenerationError ++ error_details: "The NeMo-Gym (Penguin) environment requires the vLLM generation backend with async_engine enabled. NeMo-Gym provides advanced RL training features that depend on async generation. This is an internal configuration issue with the training environment, please contact the administrator." ++ ++ # NeMo-Gym/Penguin requires HTTP server ++ - contains: "expose the vllm server via" ++ exception: GenerationError ++ error_details: "The NeMo-Gym (Penguin) environment requires the vLLM server to be exposed via HTTP (expose_http_server: true). This allows the environment to communicate with the generation model through an HTTP API. 
This is an internal configuration issue with the training environment, please contact the administrator." ++ ++ # NeMo-Gym/Penguin incompatible with reasoning parser ++ - contains: "Please do not use a reasoning parser in vLLM" ++ exception: GenerationError ++ error_details: "The NeMo-Gym (Penguin) environment is incompatible with vLLM's reasoning parser. NeMo-Gym handles all data processing including reasoning traces itself, so having a reasoning parser in vLLM would cause conflicts. This is an internal configuration issue with the training environment, please contact the administrator." ++ ++ # No placement groups available for vLLM ++ - exact: "No placement groups available in the cluster" ++ exception: GenerationError ++ error_details: "No Ray placement groups are available for vLLM generation workers. This means the cluster could not allocate the required GPU resources for the generation component of training. This is typically a resource availability issue - please try again or contact your administrator to check cluster capacity." ++ ++ # Unable to allocate vLLM worker groups ++ - contains: "Unable to allocate any worker groups with the available resources" ++ exception: GenerationError ++ error_details: "Could not allocate any vLLM worker groups with the available cluster resources. The generation component of DPO/GRPO training requires dedicated GPU resources for vLLM inference workers. Please ensure the cluster has enough GPUs, or reduce the generation parallelism settings." ++ ++ # Placement group contains no bundles ++ - exact: "Placement group contains no bundles" ++ exception: GenerationError ++ error_details: "A Ray placement group allocated for vLLM generation workers contains no resource bundles. This indicates an issue with cluster resource allocation. This is an infrastructure issue - please contact your administrator." 
++ ++ # Failed to retrieve bundle/node mapping from placement group ++ - contains: "Failed to retrieve bundle/node mapping from placement group" ++ exception: GenerationError ++ error_details: "Could not retrieve the bundle-to-node mapping from the Ray placement group for vLLM workers. This indicates an issue with the distributed training cluster setup. This is an infrastructure issue - please contact your administrator." ++ ++ # No output received for generation request ++ - regex: "^No output received for request .+$" ++ exception: GenerationError ++ error_details: "The vLLM async generation engine did not produce any output for a generation request. This can happen when: 1) The generation request timed out, 2) The vLLM worker encountered an internal error, or 3) GPU memory was exhausted during generation. This is typically a transient issue - please try running your training job again." ++ ++ # =========================================================================== ++ # 14. INTERNAL ERRORS (500) ++ # =========================================================================== ++ ++ # --- Automodel Pipeline Parallelism Errors --- ++ # Pipeline parallelism: first stage missing inputs ++ - exact: "You must provide either input_ids or inputs_embeds" ++ exception: InternalError ++ error_details: "Pipeline parallelism internal error: the first pipeline stage did not receive input data (input_ids or inputs_embeds). This is an internal configuration issue with how the model is split across pipeline stages, please reach out to the NeMo Platform team." ++ ++ # Pipeline parallelism: intermediate stage missing embeddings ++ - exact: "inputs_embeds must be provided for pipeline stages without embed_tokens" ++ exception: InternalError ++ error_details: "Pipeline parallelism internal error: an intermediate pipeline stage did not receive embeddings from the previous stage. 
In pipeline parallelism, each stage processes a portion of the model layers and passes activations to the next stage. This error indicates the inter-stage communication failed, and is an internal training configuration issue, please reach out to the NeMo Platform team." ++ ++ # --- Automodel MoE (Mixture of Experts) Errors --- ++ # MoE: only 1D mesh supported (occurs when TP+EP are both > 1) ++ - exact: "We only support 1D mesh for MoE" ++ exception: ParallelismConfigError ++ error_details: "MoE (Mixture of Experts) models do not support combining tensor parallelism with expert parallelism. When using expert_model_parallel_size > 1, you must set tensor_parallel_size=1. Please update your parallelism configuration to disable tensor parallelism for MoE training." ++ ++ # MoE: DTensor placement error (checkpoint/parallelism mismatch) ++ - contains: "has unsupported DTensor placement" ++ exception: ParallelismConfigError ++ error_details: "MoE (Mixture of Experts) model checkpoint has an incompatible tensor distribution for the current expert parallelism settings. This typically occurs when the base model checkpoint was saved with different expert_model_parallel_size than what you're using for training. Please ensure your expert_model_parallel_size matches how the base model was originally distributed, or use a checkpoint that was saved without expert parallelism (expert_model_parallel_size=1)." ++ ++ # --- Automodel Fused Optimization Errors --- ++ # FusedLinearCrossEntropy configuration ++ - contains: "FusedLinearCrossEntropy requires the model to output hidden states" ++ exception: InternalError ++ error_details: "The fused linear cross-entropy optimization requires the model to output hidden states, but the model is configured to only output logits. FusedLinearCrossEntropy is a memory optimization that combines the final linear projection and loss computation. This is an internal configuration issue, contact the NeMo Platform team." 
++ ++ # --- NeMo-RL Async GRPO Errors --- ++ # Stale trajectories in replay buffer ++ - regex: "^Found \\d+ trajectories older than min_valid_version \\d+$" ++ exception: InternalError ++ error_details: "The async GRPO replay buffer contains stale trajectories that are older than the minimum valid version. In async GRPO, trajectories are generated asynchronously and stored in a replay buffer. Stale trajectories can cause training instability because they were generated by an outdated policy. This indicates a synchronization issue between generation and training workers. Please contact the administrator." ++ ++ # --- NeMo-RL Tensor Processing Errors --- ++ # Tensor dimension mismatch ++ - regex: "^tensors for .+ must have same number of dimensions" ++ exception: InternalError ++ error_details: "Tensors being processed have mismatched dimensions during internal batching. This is an internal data processing issue that should not occur with valid datasets. Please contact the NeMo Platform team with your dataset format details." ++ ++ # Tensor dtype mismatch ++ - contains: "expected consistent types but got:" ++ exception: InternalError ++ error_details: "Tensors being processed have inconsistent data types (dtypes) during internal batching. This is an internal data processing issue that should not occur with valid datasets. Please contact the NeMo Platform team." ++ ++ # Tensors on different devices ++ - contains: "expected tensors on the same device but got:" ++ exception: InternalError ++ error_details: "Tensors are located on different devices during internal processing. This is an internal distributed training issue. Please contact the NeMo Platform team." 
++ ++ # --- Automodel Configuration Errors --- ++ # Config instantiation failure (from ConfigNode.instantiate()) ++ # This prints a detailed error with "Instantiation failed for `func_name`" ++ - contains: "Instantiation failed for" ++ exception: InternalError ++ error_details: "Failed to instantiate a training configuration component. The training system uses a configuration tree where each node can instantiate Python objects (like optimizers, schedulers, or model components). This error means one of these instantiations failed, possibly due to invalid parameters or missing dependencies. Please contact the administrator." ++ ++ # Model compilation failure ++ - contains: "Model compilation failed" ++ exception: InternalError ++ error_details: "PyTorch model compilation (torch.compile) failed. Model compilation is an optional optimization that can speed up training by compiling the model graph. Training will fall back to eager mode and continue without compilation. If this error persists, it may indicate an incompatibility between the model architecture and PyTorch's compiler. Please contact the administrator if training fails." ++ ++ # --- General Training Process Errors --- ++ # Training subprocess error (generic fallback when no specific error was parsed) ++ # Matches both parser ("Training failed with exit code: X") and train.py ("Training subprocess returned with error code: X") ++ - regex: "^Training (failed with exit code|subprocess returned with (?:error )?code):? \\d+.*" ++ exception: InternalError ++ error_details: "The training process exited with a non-zero exit code, but no specific error message could be extracted from the training logs. This is a generic failure that can have many causes. Please check the full training logs for more details, and contact the administrator if you cannot determine the cause." 
++ +diff --git a/services/automodel/src/nmp/automodel/tasks/training/errors/exceptions.py b/services/automodel/src/nmp/automodel/tasks/training/errors/exceptions.py +new file mode 100644 +index 000000000..6a9852705 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/training/errors/exceptions.py +@@ -0,0 +1,431 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++""" ++Custom exceptions for Customizer training errors. ++ ++These exceptions provide user-friendly error messages for errors that may occur ++during training with various backends: ++- Automodel ++- NeMo-RL ++- Megatron Bridge ++""" ++ ++from dataclasses import dataclass ++from typing import TypedDict ++ ++ ++def format_exception_string(exc: BaseException) -> str: ++ """Format an exception as ``TypeName: message`` matching Python's traceback style. ++ ++ This is the canonical format used throughout the error-handling pipeline: ++ - ``ray_bootstrap`` writes it into the driver output buffer so the parser ++ can extract exceptions that occurred outside the subprocess. ++ - ``default_exception_handler`` uses it for the ``detail`` field reported ++ to the Jobs service. ++ - The parser's ``_EXCEPTION_RE`` regex is designed to match this format ++ when reading subprocess output. ++ """ ++ return f"{type(exc).__name__}: {exc}" ++ ++ ++class ErrorDetails(TypedDict): ++ """Error details dict for Jobs service reporting.""" ++ ++ message: str ++ type: str ++ detail: str | None ++ ++ ++@dataclass ++class CustomizerTrainingError(Exception): ++ """ ++ Base exception for Customizer training errors. ++ ++ Attributes: ++ message: User-friendly error message shown to the user. ++ detail: Technical details about the original error (for debugging). ++ user_message: Class-level default message used as fallback when the YAML rule ++ does not specify an `error_details` field. Subclasses override this. 
++ """ ++ ++ message: str ++ detail: str | None = None ++ ++ # Default user-facing message - subclasses override this. ++ # Used as fallback when YAML rule omits `error_details` field. ++ # See default_exception_handler() for usage. ++ user_message: str = "An error occurred during training." ++ ++ def __post_init__(self): ++ # Call Exception.__init__ with the message ++ super().__init__(self.message) ++ ++ def __str__(self) -> str: ++ return self.message ++ ++ def to_error_details(self) -> ErrorDetails: ++ """Convert to error_details dict for Jobs service reporting.""" ++ return ErrorDetails( ++ message=self.message, ++ type=type(self).__name__, ++ detail=self.detail, ++ ) ++ ++ ++# ============================================================================= ++# CLIENT ERRORS (400) ++# ============================================================================= ++ ++ ++@dataclass ++class DatasetFormatError(CustomizerTrainingError): ++ """ ++ Dataset has invalid format or schema. ++ ++ Raised when: ++ - Dataset sample has unsupported role (not system/user/assistant/tool) ++ - Dataset is empty or has zero valid samples ++ - Text input is not a string or list of strings ++ - Required field missing from dataset sample ++ - Prompt file does not exist ++ """ ++ ++ user_message: str = "Dataset format error. Please check your dataset matches the expected schema." ++ ++ ++@dataclass ++class TrainingConfigError(CustomizerTrainingError): ++ """ ++ Invalid training configuration. 
++ ++ Raised when: ++ - Model incompatible with pipeline parallelism (tied embeddings, encoder-decoder) ++ - PP batch/microbatch configuration invalid ++ - Model doesn't support SDPA for context parallelism ++ - Triton not installed for optimized LoRA kernels ++ - LoRA adapter dimensions mismatch ++ - DPO with dynamic batching or sequence packing ++ - GRPO missing generation config or validation dataset ++ - Async GRPO configuration errors ++ - Batch size not divisible by data parallel size ++ - World size insufficient for parallelism configuration ++ """ ++ ++ user_message: str = ( ++ "Training configuration error. Please check your parallelism settings " ++ "(tensor_parallel_size, pipeline_parallel_size, expert_model_parallel_size), " ++ "batch settings (batch_size, micro_batch_size), or training type configuration." ++ ) ++ ++ ++@dataclass ++class TrainingEnvironmentError(CustomizerTrainingError): ++ """ ++ Invalid environment configuration for GRPO. ++ ++ Raised when: ++ - GRPO environment name is not recognized ++ - GRPO environment not configured ++ - No environment found for task type ++ """ ++ ++ user_message: str = "Environment configuration error. Please check your GRPO environment settings." ++ ++ ++@dataclass ++class ParallelismConfigError(CustomizerTrainingError): ++ """ ++ Invalid parallelism configuration for MoE models. ++ ++ Raised when: ++ - MoE model uses tensor parallelism with expert parallelism (only 1D mesh supported) ++ - DTensor placement incompatible with expert parallelism settings ++ - Checkpoint parallelism settings don't match training configuration ++ """ ++ ++ user_message: str = ( ++ "Parallelism configuration error for Mixture-of-Experts (MoE) model. " ++ "MoE models do not support combining tensor_parallel_size > 1 with expert_model_parallel_size > 1. " ++ "To fix: either set tensor_parallel_size=1 when using expert parallelism, " ++ "or set expert_model_parallel_size=1 when using tensor parallelism." 
++ ) ++ ++ ++# ============================================================================= ++# NOT FOUND ERRORS (404) ++# ============================================================================= ++ ++ ++@dataclass ++class ModelNotFoundError(CustomizerTrainingError): ++ """ ++ Model or checkpoint path doesn't exist. ++ ++ Raised when: ++ - The specified checkpoint path does not exist ++ - The checkpoint directory is empty when resuming ++ - Nemotron model missing required HF source code ++ """ ++ ++ user_message: str = ( ++ "Model or checkpoint not found. The specified model path does not exist or is inaccessible. " ++ "Please verify the model identifier is correct and the model was successfully downloaded." ++ ) ++ ++ ++# ============================================================================= ++# SERVER ERRORS (500) ++# ============================================================================= ++ ++ ++@dataclass ++class ModelLoadError(CustomizerTrainingError): ++ """ ++ Failed to load or initialize model. ++ ++ Raised when: ++ - Model weights could not be applied to a layer (corruption) ++ - Model optimizations/patches failed ++ - Method signature mismatch during patching ++ - Missing lm_head.weight in model ++ - vLLM library not installed ++ - Shape mismatch for model parameters or buffers ++ - Generation output missing required fields ++ """ ++ ++ user_message: str = ( ++ "Failed to load the model. This can happen when: " ++ "1) The model checkpoint is corrupted or incomplete, " ++ "2) The model architecture is incompatible with the training configuration, " ++ "3) There is a version mismatch between the model and the training framework. " ++ "Please verify the model checkpoint is valid and complete." ++ ) ++ ++ ++@dataclass ++class CheckpointError(CustomizerTrainingError): ++ """ ++ Checkpoint save or load failure. 
++ ++ Raised when: ++ - Checkpoint directory already exists ++ - Failed to validate global plan (distributed checkpoint corruption) ++ - Missing key in checkpoint state_dict ++ - Expert weights missing from MoE checkpoint ++ - Training interrupted during checkpoint save ++ - Parallelism settings don't match checkpoint ++ - Model export or upload failed ++ """ ++ ++ user_message: str = ( ++ "Checkpoint save or load failed. This can happen when: " ++ "1) The checkpoint is corrupted or was saved incompletely (e.g., training was interrupted), " ++ "2) Disk space is insufficient for saving checkpoints, " ++ "3) The base model checkpoint is incompatible with the current training configuration." ++ ) ++ ++ ++@dataclass ++class CudaError(CustomizerTrainingError): ++ """ ++ GPU/CUDA runtime error. ++ ++ Raised when: ++ - GPU out of memory (OOM) ++ - General CUDA runtime errors ++ """ ++ ++ user_message: str = ( ++ "GPU memory exhausted. To reduce memory usage: " ++ "1) Reduce batch_size or micro_batch_size, " ++ "2) Reduce max_seq_length, " ++ "3) Use LoRA fine-tuning instead of full fine-tuning, " ++ "4) Increase tensor_parallel_size to distribute the model across more GPUs." ++ ) ++ ++ ++@dataclass ++class DistributedError(CustomizerTrainingError): ++ """ ++ Distributed training or Ray cluster failure. ++ ++ Raised when: ++ - torch.distributed not available ++ - torch.distributed not initialized ++ - Distributed operation timeout ++ - NCCL communication errors ++ - Ray cluster resource insufficiency ++ - Placement group allocation failure ++ """ ++ ++ user_message: str = "Distributed training error. Please check cluster resources and try again." ++ ++ ++@dataclass ++class GenerationError(CustomizerTrainingError): ++ """ ++ vLLM generation/inference failure. 
++ ++ Raised when: ++ - Failed to update vLLM weights from training policy ++ - Sync method called on async engine ++ - Error during rollout for a sample ++ - Async generation called without async engine ++ - Penguin requires async vLLM ++ """ ++ ++ user_message: str = ( ++ "Generation error during reinforcement learning training. " ++ "DPO and GRPO training generate model responses during the training loop to compute rewards. " ++ "This error indicates the generation step failed, which may be caused by vLLM backend issues " ++ "or incompatible generation settings." ++ ) ++ ++ ++@dataclass ++class TrainingTimeoutError(CustomizerTrainingError): ++ """ ++ Training exceeded time limit. ++ ++ Raised when: ++ - Training subprocess exceeded configured timeout ++ """ ++ ++ user_message: str = ( ++ "Training exceeded the maximum allowed time limit. " ++ "To reduce training time: reduce epochs or max_steps, use a smaller dataset, " ++ "use a smaller model, or use LoRA fine-tuning instead of full fine-tuning. " ++ "Contact your administrator if you need longer training time limits." ++ ) ++ ++ ++@dataclass ++class InternalError(CustomizerTrainingError): ++ """ ++ Unexpected internal error. ++ ++ Raised when: ++ - Pipeline stage missing input_ids or inputs_embeds ++ - MoE device mesh configuration error ++ - DTensor placement error for expert parallelism ++ - FusedLinearCrossEntropy configuration error ++ - Tensor dimension/dtype/device mismatch ++ - Logger misconfiguration ++ - Any unmatched error (fallback) ++ """ ++ ++ user_message: str = ( ++ "An unexpected internal error occurred during training. " ++ "This is typically caused by framework-level issues such as tensor misconfigurations, " ++ "device mesh errors, or internal pipeline failures. " ++ "Please try running your job again. If the issue persists, contact your administrator " ++ "with the job ID and error details for further investigation." 
def default_exception_handler(
    exception_class: type[Exception],
    original_exception: Exception,
    error_details: str | None,
) -> Exception:
    """
    Build the exception instance that replaces a caught training error.

    Intended for RulesLoader in two situations:
        1. A rule matched but supplies no custom handler
        2. Nothing matched and a fallback_exception is configured

    Args:
        exception_class: Class to instantiate (an EXCEPTION_REGISTRY entry).
        original_exception: The exception that was actually caught.
        error_details: User-friendly text from the rule's ``error_details``
            field, or None when the rule does not provide one.

    Returns:
        A new ``exception_class`` instance. For ``CustomizerTrainingError``
        subclasses it is built with ``message`` (``error_details`` when truthy,
        otherwise the class-level ``user_message``) and ``detail`` (the
        formatted original exception). Any other class is built with a single
        positional message.
    """
    if not issubclass(exception_class, CustomizerTrainingError):
        # Not expected in practice; kept so an arbitrary exception class still works.
        return exception_class(error_details or str(original_exception))

    user_message = error_details or exception_class.user_message

    # InternalError with no rule-provided text is the generic fallback: append the
    # original error so the user sees something actionable, not just the stock text.
    is_bare_internal_error = exception_class is InternalError and error_details is None
    if is_bare_internal_error:
        user_message = f"{user_message} ({format_exception_string(original_exception)})"

    return exception_class(
        message=user_message,
        detail=format_exception_string(original_exception),
    )
+@@ -0,0 +1,255 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++""" ++Error parser for subprocess output. ++ ++This module provides utilities to parse and extract meaningful error messages from ++training subprocess output (stdout/stderr). It should be used by all training backends ++(Automodel, NeMo-RL, Megatron Bridge) to capture errors for classification. ++ ++The extracted error messages are then matched against YAML rules by the ++error converter to produce user-friendly error messages. ++""" ++ ++import re ++import subprocess ++import sys ++from collections import deque ++from dataclasses import dataclass ++ ++# Number of recent output lines to keep for error parsing ++MAX_OUTPUT_LINES = 500 ++ ++# Patterns that indicate an error line (case-insensitive search) ++# These match Python exception types and common error patterns from training libraries ++ERROR_INDICATORS = [ ++ # Python exception type names (appear as "ExceptionType: message") ++ "runtimeerror", ++ "valueerror", ++ "assertionerror", ++ "importerror", ++ "attributeerror", ++ "keyerror", ++ "typeerror", ++ "filenotfounderror", ++ "permissionerror", ++ "oserror", ++ "ioerror", ++ # Generic error patterns ++ "error:", ++ "exception:", ++ "traceback", ++ # Automodel-specific patterns ++ "instantiation failed", # From ConfigNode.instantiate() ++ "model compilation failed", # From compile_utils.py ++ # NeMo-RL patterns ++ "ray error", ++ "actor died", ++ "worker crashed", ++ # Megatron Bridge patterns ++ "nemo error", ++ "lightning error", ++ # CUDA/GPU patterns ++ "cuda out of memory", ++ "out of memory", ++ "oom", ++ "cuda error", ++ "cublas error", ++ "cudnn error", ++ # Distributed training patterns ++ "nccl", ++ "gloo", ++ "distributed", ++ "mpi error", ++ # General failure patterns ++ "failed", ++ "failure", ++ "abort", ++ "killed", ++ "segmentation fault", ++ "signal", ++] ++ ++# Regex to 
@dataclass(frozen=True)
class ParsedError:
    """An error pulled out of subprocess output.

    Keeps the exception type name exactly as it was printed alongside its
    message, so a caller can rebuild a typed exception for matchers that
    look at the type name.
    """

    # Exception type name as printed (e.g. "ValueError").
    exception_type: str
    # Text that followed the type name on the same line.
    message: str

    def to_exception(self) -> Exception:
        """Rebuild an exception whose class name equals ``exception_type``.

        A brand-new class is created on every call. Its ``__name__`` is the
        recorded type name and its only base is ``RuntimeError``, so ordinary
        ``except Exception`` handling works and the original library class
        does not have to be importable.
        """
        bases = (RuntimeError,)
        rebuilt_class = type(self.exception_type, bases, {})
        return rebuilt_class(self.message)
def parse_error_from_output(output_lines: deque, returncode: int) -> ParsedError:
    """
    Turn captured subprocess output into a single structured error.

    Resolution order:
        1. Empty buffer: a generic ``RuntimeError`` carrying the exit code.
        2. Scanning from the newest line backwards, the first line that
           ``_extract_exception`` recognises is returned as-is.
        3. Otherwise, up to ten distinct lines containing any
           ``ERROR_INDICATORS`` substring (case-insensitive), newest ones
           preferred, are joined in chronological order.
        4. Otherwise, the last ten lines of output are returned together with
           the exit code.

    Args:
        output_lines: Rolling buffer of recent output lines.
        returncode: Process exit code.

    Returns:
        ParsedError with exception_type and message.
    """
    if not output_lines:
        return ParsedError("RuntimeError", f"Training failed with exit code: {returncode}")

    captured = list(output_lines)

    # Newest-first scan: the last exception printed is the one reported.
    for raw in reversed(captured):
        hit = _extract_exception(raw)
        if hit is not None:
            return hit

    # No exception-shaped line: gather lines that merely look error-related.
    newest_first: list[str] = []
    for raw in reversed(captured):
        lowered = raw.lower()
        if not any(marker in lowered for marker in ERROR_INDICATORS):
            continue
        text = _clean_line(raw)
        if text and text not in newest_first:
            newest_first.append(text)
        if len(newest_first) > 10:
            break

    if newest_first:
        chronological = newest_first[::-1]
        return ParsedError("RuntimeError", "\n".join(chronological[-10:]))

    # Nothing recognisable at all: show the tail of the output.
    tail = [_clean_line(raw) for raw in captured[-10:]]
    message = f"Training failed with exit code {returncode}. Last output:\n" + "\n".join(tail)
    return ParsedError("RuntimeError", message)
++ """ ++ if proc.stdout is None: ++ return ++ ++ try: ++ for line in iter(proc.stdout.readline, ""): ++ if not line: ++ break ++ # Stream to console ++ sys.stdout.write(line) ++ sys.stdout.flush() ++ # Capture in rolling buffer ++ buffer.append(line.rstrip("\n")) ++ except (ValueError, OSError): ++ # Process closed or pipe broken ++ pass ++ ++ ++__all__ = [ ++ "ERROR_INDICATORS", ++ "MAX_OUTPUT_LINES", ++ "ParsedError", ++ "parse_error_from_output", ++ "read_subprocess_output", ++] +diff --git a/services/automodel/src/nmp/automodel/tasks/training/integrations.py b/services/automodel/src/nmp/automodel/tasks/training/integrations.py +new file mode 100644 +index 000000000..f3610c9d1 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/training/integrations.py +@@ -0,0 +1,168 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++# ++# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual ++# property and proprietary rights in and to this material, related ++# documentation and any modifications thereto. Any use, reproduction, ++# disclosure or distribution of this material and related documentation ++# without an express license agreement from NVIDIA CORPORATION or ++# its affiliates is strictly prohibited. ++ ++"""WandB and MLflow config helpers for Automodel training.""" ++ ++import logging ++import os ++from pathlib import Path ++from typing import Any ++ ++from nmp.automodel.app.jobs.context import NMPJobContext ++from nmp.automodel.tasks.training.schemas import TrainingStepConfig ++ ++logger = logging.getLogger(__name__) ++ ++ ++def _resolve_with_fallback( ++ primary: str | None, ++ fallback: str | None, ++ default: str, ++ field_label: str | None = None, ++) -> str: ++ """Pick the first truthy value from *primary* → *fallback* → *default*. 
++ ++ When *field_label* is given and neither *primary* nor *fallback* is set, ++ a warning is logged so operators know a hardcoded default is in use. ++ """ ++ if field_label and not (primary or fallback): ++ logger.warning(f"{field_label} is not set; using fallback '{default}'.") ++ return primary or fallback or default ++ ++ ++def build_mlflow_config( ++ customizer_config: TrainingStepConfig, ++ job_ctx: NMPJobContext, ++ framework: str, ++) -> dict[str, Any] | None: ++ """Build MLflow config for Automodel training. ++ The resulting dict is passed to MLflow logging setup in the recipe config. ++ ++ Run naming strategy (same as WandB): ++ - run_name uses job_id (stable across pause/resume) ++ - task_id is added to tags for granular execution tracking ++ ++ Missing tracking URI disables integration with a warning. ++ """ ++ user_config = customizer_config.integrations.mlflow ++ if not user_config: ++ return None ++ ++ # User-provided tracking URI takes precedence over environment variable ++ tracking_uri = user_config.tracking_uri or os.environ.get("MLFLOW_TRACKING_URI") ++ if not tracking_uri: ++ logger.warning( ++ "MLflow integration is configured but no tracking URI is set " ++ "(MLFLOW_TRACKING_URI env var and integrations.mlflow.tracking_uri in job POST request are empty); " ++ "MLflow integration will be disabled." ++ ) ++ return None ++ ++ tags: dict[str, str] = { ++ "service": "customizer", ++ "framework": framework, ++ } ++ if job_ctx.workspace: ++ tags["workspace"] = job_ctx.workspace ++ if job_ctx.job_id: ++ tags["job"] = job_ctx.job_id ++ if job_ctx.task: ++ tags["task"] = job_ctx.task ++ if customizer_config.model.name: ++ tags["model_name"] = customizer_config.model.name ++ ++ # User-provided tags override defaults above ++ if user_config.tags: ++ tags.update(user_config.tags) ++ if user_config.description: ++ # MLflow run description is stored in the reserved `mlflow.note.content` tag. 
def build_wandb_config(
    customizer_config: TrainingStepConfig,
    job_ctx: NMPJobContext,
    framework: str,
) -> dict[str, Any] | None:
    """Build WandB config for Automodel training.

    The resulting dict is passed to wandb.init() as kwargs by automodel.
    See: https://docs.wandb.ai/ref/python/init

    Args:
        customizer_config: Training step config; ``integrations.wandb``,
            ``workspace_path``, ``output_model`` and ``model.name`` are read.
        job_ctx: Job context; ``workspace``, ``job_id`` and ``task`` are read.
        framework: Framework name recorded in the ``framework:`` tag.

    Returns:
        The kwargs dict, or None when no WandB integration is configured, or
        when neither ``base_url`` nor the ``WANDB_API_KEY`` environment
        variable is set (a warning is logged in that case).

    TODO: Add pause/resume support:
    - 'name' and 'id' use job_id (stable across pause/resume)
    - 'resume="allow"' enables continuing runs after pause/resume
    """
    user_config = customizer_config.integrations.wandb
    if not user_config:
        return None

    wandb_api_key = os.environ.get("WANDB_API_KEY")
    if not user_config.base_url and not wandb_api_key:
        logger.warning("WandB API key is not set and no base_url is provided, skipping WandB integration")
        return None

    # Note: This is semantically different from job_ctx.workspace.
    # This is the workspace for training artifacts.
    run_dir = Path(customizer_config.workspace_path) / "wandb"

    tags: list[str] = ["service:customizer", f"framework:{framework}"]
    if job_ctx.workspace:
        tags.append(f"workspace:{job_ctx.workspace}")
    if job_ctx.job_id:
        tags.append(f"job:{job_ctx.job_id}")
    if job_ctx.task:
        tags.append(f"task:{job_ctx.task}")
    if customizer_config.model.name:
        tags.append(f"model:{customizer_config.model.name}")
    # User-provided tags are appended after the defaults. Tags are a flat list,
    # so nothing is overridden: a user tag with the same prefix simply coexists.
    if user_config.tags:
        tags.extend(user_config.tags)

    # Pass a field label so that landing on the hardcoded default is logged,
    # matching how the MLflow builder resolves experiment_name / run_name.
    project = _resolve_with_fallback(
        user_config.project,
        customizer_config.output_model,
        "default-project",
        field_label="WandB project",
    )
    run_name = _resolve_with_fallback(
        user_config.name,
        job_ctx.job_id,
        "default-run",
        field_label="WandB name",
    )

    wandb_config: dict[str, Any] = {
        "project": project,
        "name": run_name,
        "dir": str(run_dir),
        "tags": tags,
    }
    if user_config.entity:
        wandb_config["entity"] = user_config.entity
    if user_config.notes:
        wandb_config["notes"] = user_config.notes
    if user_config.base_url:
        # For self-hosted W&B servers, base_url is passed via the settings dict
        # (wandb.init accepts settings as Union[Settings, Dict[str, Any], None]).
        logger.info(f"Using self-hosted W&B server: {user_config.base_url}")
        wandb_config["settings"] = {"base_url": user_config.base_url}

    return wandb_config
++# SPDX-License-Identifier: Apache-2.0 ++ ++ADAPTER_FILES = ["adapter_config.json", "adapter_model.safetensors"] +diff --git a/services/automodel/src/nmp/automodel/tasks/training/model_utils/file_utils.py b/services/automodel/src/nmp/automodel/tasks/training/model_utils/file_utils.py +new file mode 100644 +index 000000000..579fb5559 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/training/model_utils/file_utils.py +@@ -0,0 +1,172 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++# ++# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual ++# property and proprietary rights in and to this material, related ++# documentation and any modifications thereto. Any use, reproduction, ++# disclosure or distribution of this material and related documentation ++# without an express license agreement from NVIDIA CORPORATION or ++# its affiliates is strictly prohibited. 
++import logging ++from enum import Enum ++from pathlib import Path ++from typing import Dict, List, Optional ++ ++from nmp.automodel.tasks.training.model_utils.constants import ADAPTER_FILES ++ ++ ++class TargetCheckpointType(str, Enum): ++ """Target checkpoint format types for model conversion.""" ++ ++ NEMO = "NEMO" ++ HF = "HF" ++ HF_LORA = "HF_LORA" ++ ++ ++logger = logging.getLogger(__name__) ++ ++ ++def get_flat_files_list(parent_dir: str) -> List[str]: ++ """ ++ Get a list of files in a directory ++ """ ++ parent_path = Path(parent_dir).resolve() ++ if not parent_path.exists(): ++ raise ValueError(f"Path {parent_dir} does not exist") ++ if not parent_path.is_dir(): ++ raise ValueError(f"Path {parent_dir} is not a directory") ++ ++ return [str(path) for path in parent_path.rglob("*") if path.is_file()] ++ ++ ++def is_adapter_file_present(files: List[str]) -> bool: ++ """ ++ Check if the any file is a LoRA adapter file ++ """ ++ for file in files: ++ if not file: ++ continue ++ if any(adapter_file in file.lower() for adapter_file in ADAPTER_FILES): ++ return True ++ return False ++ ++ ++def check_directory_structure(path: Path | str, target: Dict[str, Optional[Dict]]) -> bool: ++ if isinstance(path, str): ++ path = Path(path) ++ ++ if not path.is_dir(): ++ logger.error(f"Provided path '{path}' is not a directory") ++ return False ++ ++ try: ++ got_files = {f.name for f in path.iterdir()} ++ except OSError: ++ logger.exception("Cannot read directory '%s'", path) ++ return False ++ ++ expected_files = set(target.keys()) ++ missing = expected_files - got_files ++ if missing: ++ logger.debug(f"Mismatch in '{path}': Missing items -> {missing}") ++ return False ++ ++ for name, _target in target.items(): ++ current_path = path / name ++ if isinstance(_target, dict): ++ # this is a directory ++ if not current_path.is_dir(): ++ return False ++ if not check_directory_structure(current_path, _target): ++ return False ++ elif _target is None: ++ if not 
current_path.is_file(): ++ logger.debug(f"Mismatch: '{current_path}' is expected to be a file but is a directory.") ++ return False ++ return True ++ ++ ++def is_nemo_model_directory(model_path: Path | str) -> bool: ++ nemo_structure = { ++ "context": {"nemo_tokenizer": {}, "model.yaml": None}, ++ "weights": {"metadata.json": None}, ++ } ++ return check_directory_structure(model_path, nemo_structure) ++ ++ ++def is_huggingface_model_directory(model_path: Path | str) -> bool: ++ """ ++ Checks if a directory contains the necessary files to be considered a ++ Hugging Face model directory. ++ ++ Args: ++ directory_path: The path to the directory to check. ++ ++ Returns: ++ True if the directory contains a config.json file and model weights, ++ False otherwise. ++ """ ++ if isinstance(model_path, str): ++ model_path = Path(model_path) ++ ++ # 1. Check for the mandatory config.json file ++ config_file = model_path / "config.json" ++ if not config_file.is_file(): ++ logger.debug(f"Missing {config_file}") ++ return False ++ ++ tokenizer_files = [ ++ model_path / "tokenizer.json", ++ model_path / "tokenizer_config.json", ++ model_path / "vocab.txt", ++ model_path / "merges.txt", ++ ] ++ if not any(tf.is_file() for tf in tokenizer_files): ++ logger.debug(f"Missing any tokenizer file: at least one of [{tokenizer_files}] is required") ++ return False ++ ++ # 2. 
def determine_llm_model_type(model_dir: str | Path) -> TargetCheckpointType | None:
    """
    Classify a model directory as HF LoRA adapter, NeMo, or HuggingFace.

    The formats are probed in that order and the first match wins. The LoRA
    probe looks at every file found recursively under the directory.

    Args:
        model_dir: Path to the model directory; it is resolved before use.

    Returns:
        The detected ``TargetCheckpointType``, or None when the path is not an
        existing directory or no format matches (the file list is then logged
        at warning level).
    """
    model_path = Path(model_dir).resolve()

    if not (model_path.exists() and model_path.is_dir()):
        logger.error(f"Provided path {model_path} is not a directory")
        return None

    # (probe label for the debug line, predicate, label for the info line, result).
    # The "HugginFace" spelling is the existing debug text and is kept verbatim.
    probes = (
        (
            "LoRA adapter",
            lambda: is_adapter_file_present(get_flat_files_list(str(model_path))),
            "Huggingface LoRA adapter",
            TargetCheckpointType.HF_LORA,
        ),
        (
            "NeMo",
            lambda: is_nemo_model_directory(model_path),
            "NeMo",
            TargetCheckpointType.NEMO,
        ),
        (
            "HugginFace",
            lambda: is_huggingface_model_directory(model_path),
            "HuggingFace",
            TargetCheckpointType.HF,
        ),
    )

    for probe_label, matches, detected_label, checkpoint_type in probes:
        logger.debug(f"Checking model in {model_path} for {probe_label} format indicators")
        if matches():
            logger.info(f"{detected_label} format detected in {model_path}")
            return checkpoint_type

    logger.warning(f"model at {model_path} is an unknown checkpoint format")
    logger.warning(f"File List: {get_flat_files_list(str(model_path))}")

    return None
class JobsServiceProgressReporter:
    """Reports high-level progress to the Jobs service.

    This class provides progress reporting for the training runner:
    - configure_progress_tracking(max_steps, num_epochs) - Set bounds for percentage calculation
    - report_running(phase, **details) - Report current phase (auto-calculates percentage_done)
    - report_completed(message) - Report successful completion
    - report_error(message) - Report failure

    For training backends that need to report detailed metrics, the
    `update_task` method is exposed for direct use. See `TrainingProgressCallback`
    in the automodel backend for an example.

    Reporting is enabled only on the main rank (``RANK`` environment variable
    equal to 0, treated as 0 when unset) and only when the job context provides
    a job id, a step and a normalized task name. When disabled, updates are
    skipped without error.
    """

    def __init__(self, job_ctx: NMPJobContext):
        """Initialize the progress reporter.

        Args:
            job_ctx: Job context identifying the workspace, job, step and task
                that updates are written against.
        """
        self._job_ctx = job_ctx
        self._sdk = get_task_sdk(SERVICE_NAME)
        self._is_main_rank = int(os.environ.get("RANK", "0")) == 0
        # Both stay 0 until configure_progress_tracking() is called; a
        # non-positive max_steps disables percentage calculation.
        self._max_steps = 0
        self._num_epochs = 0

        self._enabled = self._is_main_rank and all(
            [self._job_ctx.job_id, self._job_ctx.step, self._job_ctx.normalized_task]
        )

    def configure_progress_tracking(self, max_steps: int, num_epochs: int) -> None:
        """Configure progress tracking at the start of training.

        Args:
            max_steps: Total number of training steps
            num_epochs: Total number of epochs
        """
        self._max_steps = max_steps
        self._num_epochs = num_epochs

    def _calculate_percentage_done(self, step: int | None) -> int:
        """Calculate percentage done based on current step and max_steps.

        Args:
            step: Current training step, or None when unknown.

        Returns:
            An integer in the range 0..100. 0 is returned when ``step`` is None
            or ``max_steps`` has not been configured to a positive value.
        """
        if step is None or self._max_steps <= 0:
            return 0
        raw_percentage = int((step / self._max_steps) * 100)
        # Clamp: a step past max_steps (or a negative step) must not be
        # reported as more than 100% or less than 0%.
        return max(0, min(100, raw_percentage))

    def update_task(
        self,
        status: str = "active",
        status_details: dict[str, Any] | None = None,
        error_details: dict[str, Any] | None = None,
    ) -> None:
        """Update task status via SDK.

        This is the low-level method exposed for composition by training
        callbacks that need to report detailed metrics. Failures of the SDK
        call are logged as warnings and never raised, so progress reporting
        cannot fail the training run.

        Args:
            status: Task status ("active", "completed", "error")
            status_details: Details about the current status
            error_details: Error information (for status="error")
        """
        if not self._enabled:
            return

        # Only report from rank 0 in distributed training
        if not self._is_main_rank:
            return

        try:
            self._sdk.jobs.tasks.create_or_update(
                name=self._job_ctx.normalized_task,
                workspace=self._job_ctx.workspace,
                job=self._job_ctx.job_id,
                step=self._job_ctx.step,
                status=status,
                status_details=status_details or {},
                error_details=error_details or {},
            )
        except Exception as e:
            logger.warning(f"Failed to update task progress: {e}")

    def fetch_current_metrics(self) -> dict[str, list[dict[str, float | int]]]:
        """Fetch accumulated metrics from the server for the current task.

        Used to seed metric accumulators on startup so that metrics
        survive pause/resume cycles. Returns empty lists on failure
        or if no prior metrics exist.

        Returns:
            A dict with exactly the keys ``train_loss`` and ``val_loss``.
        """
        if not self._enabled:
            return {"train_loss": [], "val_loss": []}

        try:
            task = self._sdk.jobs.tasks.retrieve(
                name=self._job_ctx.normalized_task,
                workspace=self._job_ctx.workspace,
                job=self._job_ctx.job_id,
                step=self._job_ctx.step,
            )
            metrics = (task.status_details or {}).get("metrics", {})
            return {
                "train_loss": metrics.get("train_loss", []),
                "val_loss": metrics.get("val_loss", []),
            }
        except Exception as e:
            logger.info(f"No prior metrics to seed (expected on first run): {e}")
            return {"train_loss": [], "val_loss": []}

    # --- High-level runner methods ---

    def report_running(self, phase: str, **details: Any) -> None:
        """Report that a phase is running.

        If 'step' is provided and training schedule is set (via configure_progress_tracking),
        percentage_done is automatically calculated unless explicitly provided.
        An explicitly provided percentage_done is passed through unchanged.

        Args:
            phase: The current phase (e.g., "compiling_config", "training")
            **details: Additional context (e.g., step, epoch, loss, backend="automodel")
        """
        # Auto-calculate percentage_done if step is provided and not already set
        if "step" in details and "percentage_done" not in details and self._max_steps > 0:
            details["percentage_done"] = self._calculate_percentage_done(details["step"])

        status_details = {"phase": phase, **details}
        self.update_task(status="active", status_details=status_details)

    def report_completed(self, message: str = "Completed") -> None:
        """Report task completed successfully.

        Args:
            message: Completion message
        """
        self.update_task(status="completed", status_details={"message": message, "phase": "completed"})

    def report_error(self, error: str | dict[str, Any]) -> None:
        """Report task error.

        Args:
            error: Error message (str) or error details dict with 'message', 'type', 'detail' keys.
                The dict format is typically from create_error_details() in the errors module.
        """
        if isinstance(error, str):
            # Wrap a bare message so the payload always has the dict shape.
            error_details = {"message": error}
        else:
            error_details = error
        self.update_task(status="error", error_details=error_details)

    def close(self) -> None:
        """Clean up SDK resources."""
        self._sdk.close()
++# SPDX-License-Identifier: Apache-2.0 ++ ++from dataclasses import dataclass ++from pathlib import Path ++from typing import Any ++ ++ ++@dataclass ++class LibraryConfig: ++ """nemo-automodel recipe config written by the training runner.""" ++ ++ config_dict: dict[str, Any] ++ config_path: Path +diff --git a/services/automodel/src/nmp/automodel/tasks/training/runner.py b/services/automodel/src/nmp/automodel/tasks/training/runner.py +new file mode 100644 +index 000000000..979a95321 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/training/runner.py +@@ -0,0 +1,189 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++""" ++Training runner with distributed coordination support. ++ ++Orchestrates Automodel training in single-node and multi-node environments, ++using file-based barriers for cross-pod synchronization. ++""" ++ ++import json ++import logging ++import random ++import time ++from enum import Enum ++from pathlib import Path ++from types import TracebackType ++ ++import yaml ++from nmp.automodel.app.constants import DEFAULT_TRAINING_RESULT_FILE_NAME ++from nmp.automodel.app.jobs.context import NMPJobContext ++ ++from .backends.backend import AUTOMODEL_CONFIG_FILENAME, AutomodelBackend ++from .distributed import DistributedContext ++from .errors.converter import create_error_details ++from .progress import JobsServiceProgressReporter ++from .protocol import LibraryConfig ++from .schemas import ( ++ GPUInfo, ++ TrainingMetrics, ++ TrainingResult, ++ TrainingStepConfig, ++) ++from .utils import get_gpu_info ++ ++ ++# Custom YAML representer to serialize Enum values as their string values ++def _enum_representer(dumper: yaml.Dumper, data: Enum) -> yaml.Node: ++ """Represent Enum as its value (string) rather than a Python object tag.""" ++ return dumper.represent_str(str(data.value)) ++ ++ ++yaml.add_representer(Enum, _enum_representer) 
# Multi-representer: also applies to subclasses of Enum, i.e. to every concrete enum member.
yaml.add_multi_representer(Enum, _enum_representer)

logger = logging.getLogger(__name__)

# Barrier names handed to the DistributedContext calls below.
BARRIER_CONFIG_READY = "config_ready"
BARRIER_TRAINING_COMPLETE = "training_complete"


class TrainingRunner:
    """
    Orchestrates Automodel training across single-node and multi-node environments.

    The coordinator compiles the library config, writes it to the workspace and
    signals a barrier; other ranks wait on that barrier and load the file. All
    ranks then train, meet at a second barrier, and the coordinator alone
    post-processes the checkpoint and writes the result file.

    Usage:
        with TrainingRunner() as runner:
            result = runner.run()
    """

    def __init__(self, backend: AutomodelBackend | None = None) -> None:
        """Build the runner from the job environment.

        Args:
            backend: Backend to use; when omitted an ``AutomodelBackend`` is
                created from the job context.
        """
        self._job_ctx = NMPJobContext.from_env()
        self._config = self._load_config(self._job_ctx.config_path)
        self._progress = JobsServiceProgressReporter(self._job_ctx)
        try:
            self._dist_ctx = DistributedContext.from_env(self._get_barrier_dir())
            self._backend = backend or AutomodelBackend(self._job_ctx)
            self._workspace_path = Path(self._config.workspace_path)
            self._output_path = Path(self._config.output_path)
        except BaseException:
            # __exit__ never runs when construction fails, so release the
            # reporter here instead of leaking it.
            self._progress.close()
            raise

    def __enter__(self) -> "TrainingRunner":
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        self.close()

    def close(self) -> None:
        """Close the progress reporter."""
        self._progress.close()

    def run(self) -> TrainingResult:
        """Run all phases and return the outcome.

        Exceptions raised by a phase are logged and converted into a failed
        ``TrainingResult``; they are not re-raised. The result is passed to
        ``_write_result`` on both success and failure.

        Returns:
            The training result (``success=False`` with an error message when
            any phase failed).
        """
        random.seed(self._config.seed)
        logger.info(f"Global random seed set to {self._config.seed}")

        start_time = time.time()
        gpu_info = get_gpu_info()
        # Placeholder so the ``finally`` clause always has something to write.
        result = TrainingResult(success=False, error_message="No result")

        try:
            library_config = self._compile_config_phase()
            metrics = self._training_phase(library_config)
            self._dist_ctx.sync_point(BARRIER_TRAINING_COMPLETE)
            result = self._postprocess_phase(gpu_info, metrics, start_time, library_config)
        except Exception as e:
            logger.exception(f"Training failed: {e}")
            error_details = create_error_details(e)
            result = TrainingResult(
                success=False,
                error_message=error_details.get("message", str(e)),
                gpu_info=gpu_info,
                training_duration_seconds=time.time() - start_time,
            )
            if self._dist_ctx.is_coordinator:
                # A failing status update must not replace the training error
                # or prevent the failed result from being returned.
                try:
                    self._progress.report_error(error_details)
                except Exception:
                    logger.exception("Failed to report training error via the progress reporter")
        finally:
            self._write_result(result)
        return result

    def _get_barrier_dir(self) -> Path:
        """Return ``<storage_path>/<attempt_id>/distributed/barriers``."""
        return self._job_ctx.storage_path / self._job_ctx.attempt_id / "distributed" / "barriers"

    def _load_config(self, config_path: Path) -> TrainingStepConfig:
        """Read the JSON step config and validate it into ``TrainingStepConfig``."""
        # Explicit encoding: the platform default depends on the locale.
        with open(config_path, encoding="utf-8") as f:
            return TrainingStepConfig.model_validate(json.load(f))

    def _get_library_config_path(self) -> Path:
        """Return the path of the library config file inside the workspace."""
        return self._workspace_path / AUTOMODEL_CONFIG_FILENAME

    def _compile_config_phase(self) -> LibraryConfig:
        """Produce the library config on the coordinator and share it via file.

        The coordinator compiles the config, dumps it as YAML and signals
        ``BARRIER_CONFIG_READY``; every other rank waits for that signal and
        loads the YAML file.
        """
        config_path = self._get_library_config_path()

        if self._dist_ctx.is_coordinator:
            self._progress.report_running("compiling_config")
            config_dict = self._backend.compile_config(self._config, self._workspace_path)
            config_path.parent.mkdir(parents=True, exist_ok=True)
            with open(config_path, "w", encoding="utf-8") as f:
                yaml.dump(config_dict, f, default_flow_style=False)
            logger.info(f"Library config written to: {config_path}")
            self._dist_ctx.signal(BARRIER_CONFIG_READY)
            return LibraryConfig(config_dict=config_dict, config_path=config_path)

        self._dist_ctx.wait_for_coordinator(BARRIER_CONFIG_READY)
        return self._load_library_config(config_path)

    def _load_library_config(self, config_path: Path) -> LibraryConfig:
        """Load the YAML library config written by the coordinator.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the file does not contain a YAML mapping (for
                example when it is empty).
        """
        if not config_path.exists():
            raise FileNotFoundError(
                f"Library config not found at {config_path}. Coordinator may not have written it yet."
            )
        with open(config_path, encoding="utf-8") as f:
            config_dict = yaml.safe_load(f)
        # safe_load returns None for an empty document; fail here with a clear
        # message rather than handing a non-dict to the backend.
        if not isinstance(config_dict, dict):
            raise ValueError(
                f"Library config at {config_path} is not a mapping (got {type(config_dict).__name__})."
            )
        logger.info(f"Loaded library config from: {config_path}")
        return LibraryConfig(config_dict=config_dict, config_path=config_path)

    def _training_phase(self, library_config: LibraryConfig) -> TrainingMetrics:
        """Delegate training to the backend and return its metrics."""
        return self._backend.execute_training(self._config, library_config, self._progress)

    def _postprocess_phase(
        self,
        gpu_info: GPUInfo | None,
        metrics: TrainingMetrics,
        start_time: float,
        library_config: LibraryConfig,
    ) -> TrainingResult:
        """Build the successful result; checkpoint handling is coordinator-only.

        Non-coordinator ranks return a bare success result. The coordinator
        locates the best checkpoint, processes it into the output path, and
        reports completion.
        """
        if not self._dist_ctx.is_coordinator:
            return TrainingResult(
                success=True,
                gpu_info=gpu_info,
                training_duration_seconds=time.time() - start_time,
            )

        self._progress.report_running("processing_checkpoint")
        checkpoint_path = self._backend.find_best_checkpoint(self._workspace_path, self._config, library_config)
        checkpoint_info = self._backend.process_checkpoint(
            checkpoint_path, self._output_path, self._config, library_config
        )

        result = TrainingResult(
            success=True,
            checkpoint=checkpoint_info,
            gpu_info=gpu_info,
            metrics=metrics,
            training_duration_seconds=time.time() - start_time,
        )
        self._progress.report_completed("Training completed")
        return result

    def _write_result(self, result: TrainingResult) -> None:
        """Write the result as JSON into the workspace (coordinator only)."""
        if not self._dist_ctx.is_coordinator:
            return
        result_path = self._workspace_path / DEFAULT_TRAINING_RESULT_FILE_NAME
        result_path.parent.mkdir(parents=True, exist_ok=True)
        # Error messages may contain non-ASCII text; do not depend on the locale encoding.
        with open(result_path, "w", encoding="utf-8") as f:
            f.write(result.model_dump_json(indent=2))
        logger.info(f"Result written to: {result_path}")
NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++from nmp.automodel.app.jobs.training.schemas import ( ++ CheckpointInfo, ++ DistillationConfig, ++ EmbeddingConfig, ++ GPUInfo, ++ LoRAConfig, ++ MLflowConfig, ++ ModelConfig, ++ OptimizerType, ++ TrainingMetrics, ++ TrainingResult, ++ TrainingStepConfig, ++ WandBConfig, ++) ++from nmp.automodel.entities.values import ( ++ CheckpointFormat, ++ FinetuningType, ++ Precision, ++ TrainingType, ++) ++ ++__all__ = [ ++ "CheckpointFormat", ++ "FinetuningType", ++ "Precision", ++ "TrainingType", ++ "CheckpointInfo", ++ "DistillationConfig", ++ "EmbeddingConfig", ++ "GPUInfo", ++ "LoRAConfig", ++ "MLflowConfig", ++ "ModelConfig", ++ "OptimizerType", ++ "TrainingMetrics", ++ "TrainingResult", ++ "TrainingStepConfig", ++ "WandBConfig", ++] +diff --git a/services/automodel/src/nmp/automodel/tasks/training/sequence_packing.py b/services/automodel/src/nmp/automodel/tasks/training/sequence_packing.py +new file mode 100644 +index 000000000..ae1948ee1 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/training/sequence_packing.py +@@ -0,0 +1,349 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++""" ++Sequence packing utilities for Automodel training. ++ ++Sequence packing combines multiple shorter sequences into a single packed sequence ++to improve GPU utilization during training. This module provides: ++ ++1. Optimal pack size calculation based on dataset statistics ++2. 
@dataclass
class PackingEstimate:
    """Sequence-length statistics from a dataset sample and the derived pack size.

    Attributes:
        pack_size: Recommended pack size in tokens for Automodel's
            `packed_sequence.packed_sequence_size` config
        avg_seq_length: Average sequence length in the sampled data,
            rounded up to an even number
        max_seq_length: Maximum sequence length in the sampled data,
            rounded up to an even number
        packing_factor: Estimated number of sequences per pack
            (pack_size / avg_seq_length), rounded to two decimals
        samples_analyzed: Number of samples successfully tokenized
    """

    pack_size: int
    avg_seq_length: int
    max_seq_length: int
    packing_factor: float
    samples_analyzed: int


def _ceil_even(num: int | float) -> int:
    """Round up to the nearest even number.

    Examples:
        >>> _ceil_even(3)
        4
        >>> _ceil_even(4)
        4
        >>> _ceil_even(5.5)
        6
    """
    return int(math.ceil(num / 2) * 2)


def calculate_optimal_pack_size(
    config: TrainingStepConfig,
    dataset_avg_seq_length: int | None = None,
    dataset_max_seq_length: int | None = None,
) -> int:
    """
    Calculate the pack size for sequence packing.

    Steps:
    1. packing_factor = max(global_batch_size // total_gpus, 1)
    2. target = avg_seq_length * packing_factor, rounded up to an even number
    3. raise the target to at least the dataset's max sequence length
    4. clamp to the model's max_seq_length

    If either dataset statistic is missing, the model's max_seq_length is
    returned unchanged.

    Args:
        config: Training configuration containing parallelism, batch, and model settings
        dataset_avg_seq_length: Average sequence length in the dataset (after tokenization)
        dataset_max_seq_length: Maximum sequence length in the dataset

    Returns:
        Pack size in tokens

    Example:
        With global_batch_size=32, 8 GPUs, avg_seq_length=512,
        max_seq_length=1024 and model.max_seq_length=4096:
            - packing_factor = 32 // 8 = 4
            - target = ceil_even(512 * 4) = 2048
            - max(2048, 1024) = 2048, min(2048, 4096) = 2048
    """
    parallelism = config.parallelism
    total_gpus = parallelism.num_nodes * parallelism.num_gpus_per_node
    gbs = config.batch.global_batch_size
    model_max_seq = config.model.max_seq_length

    # If no dataset stats provided, use model's max_seq_length (conservative)
    if dataset_avg_seq_length is None or dataset_max_seq_length is None:
        logger.info(f"No dataset statistics provided, using model max_seq_length: {model_max_seq}")
        return model_max_seq

    # A GPU count of zero would make the floor division below raise
    # ZeroDivisionError; treat it as a single device instead.
    target_packing_factor = max(gbs // max(total_gpus, 1), 1)

    # Pack size from the average length, rounded UP to an even number.
    target_pack_size = _ceil_even(round(dataset_avg_seq_length * target_packing_factor))

    # Never go below the longest observed sequence.
    target_pack_size = max(target_pack_size, dataset_max_seq_length)

    # Clamp to model's maximum sequence length
    optimal_pack_size = min(target_pack_size, model_max_seq)

    logger.info(
        f"Calculated optimal pack size: {optimal_pack_size} "
        f"(avg_seq={dataset_avg_seq_length}, max_seq={dataset_max_seq_length}, "
        f"packing_factor={target_packing_factor})"
    )

    return optimal_pack_size


def estimate_dataset_sequence_lengths(
    config: TrainingStepConfig,
    train_file: Path | None = None,
    max_samples: int = 1000,
    seed: int = DEFAULT_SEED,
    trust_remote_code: bool = False,
) -> PackingEstimate | None:
    """
    Estimate dataset sequence lengths by sampling and calculate the pack size.

    The function:
    1. Loads the model's tokenizer
    2. Randomly samples up to `max_samples` lines using reservoir sampling
    3. Tokenizes each sampled example
    4. Calculates the pack size from the resulting statistics

    Sampling uses a private ``random.Random(seed)`` instance, so the
    process-wide ``random`` state is left untouched.

    NOTE: Sampling may underestimate max_seq_length for datasets with rare
    long sequences; the final pack size is still clamped to the model's
    max_seq_length.

    Args:
        config: Training configuration with dataset and model paths
        train_file: Path to the prepared training JSONL file. When omitted,
            ``config.dataset.path / "train.jsonl"`` is used.
        max_samples: Maximum number of samples to analyze (default: 1000)
        seed: Seed for reproducible sampling (defaults to ``DEFAULT_SEED``)
        trust_remote_code: Forwarded to ``AutoTokenizer.from_pretrained`` (default: False)

    Returns:
        PackingEstimate with pack_size and statistics, or None if the file is
        missing, no sample could be tokenized, or any step raised.
    """
    try:
        if train_file is None:
            train_file = Path(config.dataset.path) / "train.jsonl"
        else:
            train_file = Path(train_file)

        if not train_file.exists():
            logger.warning(f"Training file not found: {train_file}")
            return None

        # Import here to avoid ModuleNotFoundError in environments where
        # transformers is not installed (e.g., during test collection)
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained(
            config.model.path,
            trust_remote_code=trust_remote_code,
        )

        # Dedicated generator: calling random.seed() here would silently
        # replace whatever seed the caller installed on the global RNG.
        rng = random.Random(seed)

        lengths = _sample_sequence_lengths(train_file, tokenizer, max_samples, rng=rng)

        if not lengths:
            logger.warning("Could not estimate sequence lengths from dataset")
            return None

        avg_length = _ceil_even(int(sum(lengths) / len(lengths)))
        max_length = _ceil_even(max(lengths))

        pack_size = calculate_optimal_pack_size(config, avg_length, max_length)
        packing_factor = pack_size / avg_length if avg_length > 0 else 1.0

        estimate = PackingEstimate(
            pack_size=pack_size,
            avg_seq_length=avg_length,
            max_seq_length=max_length,
            packing_factor=round(packing_factor, 2),
            samples_analyzed=len(lengths),
        )

        logger.info(
            f"Packing estimate from {len(lengths)} samples: "
            f"pack_size={pack_size}, avg_seq={avg_length}, max_seq={max_length}, "
            f"packing_factor={estimate.packing_factor:.2f}"
        )

        return estimate

    except Exception as e:
        # Best-effort by design: callers receive None and decide what to do.
        logger.warning(f"Failed to estimate sequence lengths: {e}", exc_info=True)
        return None


def _sample_sequence_lengths(
    train_file: Path,
    tokenizer,
    max_samples: int,
    rng: random.Random | None = None,
) -> list[int]:
    """
    Sample lines from a JSONL file and return their tokenized lengths.

    Lines are chosen with reservoir sampling; blank lines are ignored. Each
    chosen line is parsed and measured with ``_get_sample_token_length``;
    lines that fail to parse or tokenize are skipped.

    Args:
        train_file: Path to training JSONL file
        tokenizer: HuggingFace tokenizer
        max_samples: Maximum samples to return
        rng: Generator used for sampling. When omitted, the module-level
            ``random`` functions are used.

    Returns:
        List of sequence lengths (in tokens)
    """
    if max_samples <= 0:
        return []

    randint = rng.randint if rng is not None else random.randint

    # Reservoir sampling over the non-blank lines of the file.
    samples: list[str] = []
    seen = 0
    with open(train_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Blank lines can never be parsed; do not let them take a slot.
                continue
            if seen < max_samples:
                samples.append(line)
            else:
                j = randint(0, seen)
                if j < max_samples:
                    samples[j] = line
            seen += 1

    lengths = []
    skipped = 0
    for line in samples:
        try:
            length = _get_sample_token_length(json.loads(line), tokenizer)
        except Exception:
            # Skip malformed lines
            skipped += 1
            continue
        if length is not None:
            lengths.append(length)

    if skipped:
        logger.debug(f"Skipped {skipped} sampled lines that could not be parsed or tokenized")

    return lengths


def _get_sample_token_length(obj: dict, tokenizer) -> int | None:
    """
    Get the tokenized length of a dataset sample.

    Resolution order:
    1. ``messages`` key: ``tokenizer.apply_chat_template`` when available,
       otherwise (or on failure) the ``"role: content"`` lines joined by
       newlines and encoded as plain text.
    2. ``prompt`` and ``completion`` keys: both joined by a space.
    3. Anything else: all string values joined by spaces.

    Args:
        obj: Parsed JSON object from dataset
        tokenizer: HuggingFace tokenizer

    Returns:
        Token count, or None if sample is empty/invalid
    """
    if not isinstance(obj, dict):
        return None

    # Chat format: use apply_chat_template for accurate length
    if "messages" in obj:
        messages = obj["messages"]
        if messages and hasattr(tokenizer, "apply_chat_template"):
            try:
                tokens = tokenizer.apply_chat_template(
                    messages,
                    add_generation_prompt=False,
                    tokenize=True,
                )
                # NOTE(review): some tokenizer versions appear to return a
                # mapping with an "input_ids" entry rather than a flat id
                # list; len() of the mapping would count keys, not tokens.
                if hasattr(tokens, "keys") and "input_ids" in tokens:
                    tokens = tokens["input_ids"]
                return len(tokens)
            except Exception:
                # Fall back to text extraction if chat template fails
                pass

        # Fallback: concatenate role + content
        parts = []
        for m in messages or []:
            if isinstance(m, dict):
                role = m.get("role", "")
                content = m.get("content", "")
                if content is None:
                    # Avoid counting the literal text "None" for content-less turns.
                    content = ""
                if role or content:
                    parts.append(f"{role}: {content}")
        text = "\n".join(parts)
        if text:
            return len(tokenizer.encode(text, add_special_tokens=True))
        return None

    # SFT format: prompt + completion
    if "prompt" in obj and "completion" in obj:
        text = str(obj["prompt"]) + " " + str(obj["completion"])
        return len(tokenizer.encode(text, add_special_tokens=True))

    # Generic: concatenate all string values
    text = " ".join(str(v) for v in obj.values() if isinstance(v, str))
    if text:
        return len(tokenizer.encode(text, add_special_tokens=True))
    return None
none and tool_choice is not none %} ++ {{- '<|start_header_id|>system<|end_header_id|>\n\n' }} ++ {{- "Environment: ipython\n\n" }} ++ {{- "Cutting Knowledge Date: December 2023\n" }} ++ {{- "Today Date: " + date_string + "\n\n" }} ++ {{- "You are a helpful assistant.\n" }} ++ {{- '<|eot_id|>' }} ++ {{- '<|start_header_id|>user<|end_header_id|>\n\n' }} ++ {{- 'You have access to the following functions to supplement your existing knowledge:\n\n' }} ++ {%- for t in tools %} ++ {%- set tname = t.function.name %} ++ {%- set tdesc = t.function.description %} ++ {%- set tparams = t.function.parameters | tojson %} ++ {{- "Use the function '" + tname + "' to '" + tdesc + "':\n" }} ++ {{- '{"name": "' + tname + '", "description": "' + tdesc + '", "parameters": ' + tparams + '}\n\n' }} ++ {%- endfor %} ++ {{- 'Think very carefully before calling functions.\n' }} ++ {{- 'Only call them if they are relevant to the prompt.\n' }} ++ {{- 'If you choose to call a function ONLY reply in the following format with no natural language surrounding it:\n\n' }} ++ {{- '{"example_name": "example_value"}\n\n' }} ++ {{- 'Reminder:\n' }} ++ {{- '- Function calls MUST follow the specified format, start with \n' }} ++ {{- '- Required parameters MUST be specified\n' }} ++ {{- '- Only call one function at a time\n' }} ++ {{- '- Put the entire function call reply on one line\n' }} ++ {{- '- Do not call functions if they are not relevant to the prompt' }} ++ {{- '<|eot_id|>' }} ++{%- endif %} ++{%- for message in loop_messages %} ++ {%- if message['role'] in ['ipython', 'tool'] %} ++ {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} ++ {{- "[stdout]" + message['content'] | trim + "[/stdout]\n<|eot_id|>" }} ++ {%- elif message['role'] == 'assistant'%} ++ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} ++ {%- if message.get('tool_calls') is not none %} ++ {%- set tool_call = message['tool_calls'][0] %} ++ {%- generation %} ++ {{- '<|python_tag|>' + tool_call.function.arguments | 
tojson + '\n<|eot_id|>' }} ++ {%- endgeneration %} ++ {%- else %} ++ {%- generation %} ++ {{- message['content'] | trim + '<|eot_id|>' }} ++ {%- endgeneration %} ++ {%- endif %} ++ {%- else %} ++ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} ++ {{- message['content'] | trim + '<|eot_id|>' }} ++ {%- endif %} ++{%- endfor %} ++{%- if add_generation_prompt %} ++ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} ++{%- endif %} +diff --git a/services/automodel/src/nmp/automodel/tasks/training/templates/llama-3.2-instruct.jinja b/services/automodel/src/nmp/automodel/tasks/training/templates/llama-3.2-instruct.jinja +new file mode 100644 +index 000000000..e074cba57 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/training/templates/llama-3.2-instruct.jinja +@@ -0,0 +1,61 @@ ++{{- bos_token }} ++{%- if not date_string is defined %} ++ {%- if strftime_now is defined %} ++ {%- set date_string = strftime_now("%d %b %Y") %} ++ {%- else %} ++ {%- set date_string = "26 Jul 2024" %} ++ {%- endif %} ++{%- endif %} ++{%- set loop_messages = messages %} ++{%- if tools is not none and tool_choice is not none %} ++ {{- '<|start_header_id|>system<|end_header_id|>\n\n' }} ++ {{- "Environment: ipython\n\n" }} ++ {{- "Cutting Knowledge Date: December 2023\n" }} ++ {{- "Today Date: " + date_string + "\n\n" }} ++ {{- "You are a helpful assistant.\n" }} ++ {{- '<|eot_id|>' }} ++ {{- '<|start_header_id|>user<|end_header_id|>\n\n' }} ++ {{- 'You have access to the following functions to supplement your existing knowledge:\n\n' }} ++ {%- for t in tools %} ++ {%- set tname = t.function.name %} ++ {%- set tdesc = t.function.description %} ++ {%- set tparams = t.function.parameters | tojson %} ++ {{- "Use the function '" + tname + "' to '" + tdesc + "':\n" }} ++ {{- '{"name": "' + tname + '", "description": "' + tdesc + '", "parameters": ' + tparams + '}\n\n' }} ++ {%- endfor %} ++ {{- 'Think very carefully before calling functions.\n' }} ++ {{- 
{#- Chat template: BOS, an optional tool preamble, one turn per message, optional generation prompt. -#}
{{- bos_token }}
{#- date_string: caller-supplied value wins, then strftime_now when available, else a fixed date. -#}
{%- if not date_string is defined %}
    {%- if strftime_now is defined %}
        {%- set date_string = strftime_now("%d %b %Y") %}
    {%- else %}
        {%- set date_string = "26 Jul 2024" %}
    {%- endif %}
{%- endif %}
{%- set loop_messages = messages %}
{#- Tool preamble: a fixed system turn followed by a user turn that lists every tool. -#}
{%- if tools is not none and tool_choice is not none %}
    {{- '<|start_header_id|>system<|end_header_id|>\n\n' }}
    {{- "Environment: ipython\n\n" }}
    {{- "Cutting Knowledge Date: December 2023\n" }}
    {{- "Today Date: " + date_string + "\n\n" }}
    {{- "You are a helpful assistant.\n" }}
    {{- '<|eot_id|>' }}
    {{- '<|start_header_id|>user<|end_header_id|>\n\n' }}
    {{- 'You have access to the following functions to supplement your existing knowledge:\n\n' }}
    {%- for t in tools %}
        {%- set tname = t.function.name %}
        {%- set tdesc = t.function.description %}
        {%- set tparams = t.function.parameters | tojson %}
        {{- "Use the function '" + tname + "' to '" + tdesc + "':\n" }}
        {{- '{"name": "' + tname + '", "description": "' + tdesc + '", "parameters": ' + tparams + '}\n\n' }}
    {%- endfor %}
    {{- 'Think very carefully before calling functions.\n' }}
    {{- 'Only call them if they are relevant to the prompt.\n' }}
    {{- 'If you choose to call a function ONLY reply in the following format with no natural language surrounding it:\n\n' }}
    {{- '{"example_name": "example_value"}\n\n' }}
    {{- 'Reminder:\n' }}
    {#- NOTE(review): the next line ends at "start with" and names no delimiter; text looks missing, confirm against the intended prompt. -#}
    {{- '- Function calls MUST follow the specified format, start with \n' }}
    {{- '- Required parameters MUST be specified\n' }}
    {{- '- Only call one function at a time\n' }}
    {{- '- Put the entire function call reply on one line\n' }}
    {{- '- Do not call functions if they are not relevant to the prompt' }}
    {{- '<|eot_id|>' }}
{%- endif %}
{%- for message in loop_messages %}
    {#- Tool results ('ipython' or 'tool' roles) are rendered as an ipython turn wrapped in [stdout] markers. -#}
    {%- if message['role'] in ['ipython', 'tool'] %}
        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
        {{- "[stdout]" + message['content'] | trim + "[/stdout]\n<|eot_id|>" }}
    {%- elif message['role'] == 'assistant'%}
        {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
        {#- Only the first tool call is rendered, and only its arguments. -#}
        {#- `generation` is not a core Jinja tag; presumably the Transformers extension that marks assistant spans. TODO confirm. -#}
        {%- if message.get('tool_calls') is not none %}
            {%- set tool_call = message['tool_calls'][0] %}
            {%- generation %}
                {#- NOTE(review): the leading '' is a no-op prefix; a call delimiter may be missing here. Confirm. -#}
                {{- '' + tool_call.function.arguments | tojson + '\n<|eot_id|>' }}
            {%- endgeneration %}
        {%- else %}
            {%- generation %}
                {{- message['content'] | trim + '<|eot_id|>' }}
            {%- endgeneration %}
        {%- endif %}
    {%- else %}
        {#- Any other role is emitted under its own header. -#}
        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}
        {{- message['content'] | trim + '<|eot_id|>' }}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}
{#- Chat template: BOS, an always-present system turn (empty when no system message is given), then one turn per message. -#}
{{- bos_token }}
{%- if messages[0]['role'] == 'system' %}
    {%- set system_message = messages[0]['content']|trim %}
    {%- set messages = messages[1:] %}
{%- else %}
    {%- set system_message = '' %}
{%- endif %}
{{- '<|start_header_id|>system<|end_header_id|>\n\n' }}
{{- system_message }}
{{- '<|eot_id|>' }}
{%- for message in messages %}
    {#- Keep only the text after the last reasoning-close marker in assistant turns. -#}
    {#- NOTE(review): the marker was an empty string, which makes split() fail on every assistant turn; restored as '</think>'. Confirm against the upstream model template. -#}
    {%- if message['role'] == 'assistant' and '</think>' in message['content'] %}
        {%- set content = message['content'].split('</think>')[-1].lstrip() %}
    {%- else %}
        {%- set content = message['content'] %}
    {%- endif %}
    {#- Real newlines after the header, matching the system turn above; '\\n\\n' rendered a literal backslash-n. -#}
    {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + content | trim + '<|eot_id|>' }}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}
' %} ++ {%- set ns.p = ns.p + 'Here is a list of functions in JSON format that you can invoke.\n' %} ++ {%- set ns.p = ns.p + '[' %} ++ {%- for tool in tools %} ++ {%- set function = tool.function %} ++ {%- set keys = function.keys() | reject('equalto', 'return') | list %} ++ {%- set ns.p = ns.p + '{"type": "function", "function": {' %} ++ {%- for key in keys %} ++ {%- set val = function[key] %} ++ {%- if val is string %} ++ {%- set ns.p = ns.p + '"' + key + '": "' + val + '"' %} ++ {%- else %} ++ {%- set ns.p = ns.p + '"' + key + '": ' + val|tojson %} ++ {%- endif %} ++ {%- if not loop.last %} ++ {%- set ns.p = ns.p + ', ' %} ++ {%- endif %} ++ {%- endfor %} ++ {%- set ns.p = ns.p + '}}' %} ++ {%- if not loop.last %} ++ {%- set ns.p = ns.p + ', ' %} ++ {%- endif %} ++ {%- endfor %} ++ {%- set ns.p = ns.p + ']\n' %} ++ {%- set ns.p = ns.p + 'If you decide to invoke any of the function(s), put it in the JSON TOOL CALLING format of ' %} ++ {%- set ns.p = ns.p + '[{"name": "func_name1", "arguments": {"params_name1": "params_value1", "params_name2": "params_value2"}}, ' %} ++ {%- set ns.p = ns.p + '{"name": "func_name2", "arguments": {"params_name1": "params_value1", "params_name2": "params_value2"}}] ' %} ++ {%- set ns.p = ns.p + '\n' %} ++ {%- set ns.p = ns.p + 'You SHOULD NOT include any other information in the response. 
REMEMBER TO USE JSON TOOL CALLING FORMAT.\n\n' %} ++{%- endif %} ++{%- for message in messages %} ++ {%- if message['role'] == 'user' %} ++ {%- if ns.has_tools %} ++ {%- if add_generation_prompt and loop.index0 == ((messages | length) - 1) %} ++ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + ns.p + (message['content'] | trim) + '<|eot_id|>' }} ++ {%- elif not add_generation_prompt and loop.index0 == ((messages | length) - 2) %} ++ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + ns.p + (message['content'] | trim) + '<|eot_id|>' }} ++ {%- else %} ++ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + (message['content'] | trim) + '<|eot_id|>' }} ++ {%- endif %} ++ {%- else %} ++ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + (message['content'] | trim) + '<|eot_id|>' }} ++ {%- endif %} ++ {%- elif message['role'] in ['ipython', 'tool'] %} ++ {{- '<|start_header_id|>user<|end_header_id|>\n\n' }} ++ {{- 'Here are the results from the tool:' + (message['content'] | trim) + '<|eot_id|>' }} ++ {%- elif message['role'] == 'assistant' %} ++ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} ++ {%- generation %} ++ {%- if message.get('tool_calls') is not none %} ++ {{- '[' }} ++ {%- for tool_call in message['tool_calls'] %} ++ {{- '{"name": "' + tool_call.function.name + '", "arguments": ' + tool_call.function.arguments | tojson }} ++ {%- if tool_call.get('id') is not none %} ++ {{- ', "id": "' + tool_call.id + '"' }} ++ {%- endif %} ++ {{- '}' }} ++ {%- if not loop.last %} ++ {{- ', ' }} ++ {%- endif %} ++ {%- endfor %} ++ {{- ']' }} ++ {{- '<|eot_id|>' }} ++ {%- else %} ++ {{- message['content'] | trim + '<|eot_id|>' }} ++ {%- endif %} ++ {%- endgeneration %} ++ {%- else %} ++ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + (message['content'] | trim) + '<|eot_id|>' }} ++ {%- endif %} ++{%- endfor %} ++{%- if add_generation_prompt %} ++ {{- 
'<|start_header_id|>assistant<|end_header_id|>\n\n' }} ++{%- endif %} +diff --git a/services/automodel/src/nmp/automodel/tasks/training/templates/phi-4.jinja b/services/automodel/src/nmp/automodel/tasks/training/templates/phi-4.jinja +new file mode 100644 +index 000000000..33a466f88 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/training/templates/phi-4.jinja +@@ -0,0 +1,15 @@ ++{%- for message in messages %} ++ {%- if (message['role'] == 'system') %} ++ {{- '<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}} ++ {%- elif (message['role'] == 'user') %} ++ {{-'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}} ++ {%- elif (message['role'] == 'assistant') %} ++ {{- '<|im_start|>assistant<|im_sep|>' }} ++ {%- generation %} ++ {{- message['content'] + '<|im_end|>'}} ++ {%- endgeneration %} ++ {%- endif %} ++{%- endfor %} ++{%- if add_generation_prompt %} ++ {{- '<|im_start|>assistant<|im_sep|>' }} ++{%- endif %} +diff --git a/services/automodel/src/nmp/automodel/tasks/training/utils.py b/services/automodel/src/nmp/automodel/tasks/training/utils.py +new file mode 100644 +index 000000000..afbf73f97 +--- /dev/null ++++ b/services/automodel/src/nmp/automodel/tasks/training/utils.py +@@ -0,0 +1,93 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++import logging ++import os ++ ++from nmp.common.jobs.constants import NEMO_JOB_ID_ENVVAR ++ ++from .schemas import GPUInfo ++ ++logger = logging.getLogger(__name__) ++ ++ ++def _get_architecture_name(major: int, minor: int) -> str: ++ """Map CUDA compute capability to architecture name. 
++ ++ https://developer.nvidia.com/cuda-gpus ++ """ ++ if major == 3: ++ return "Kepler" ++ if major == 5: ++ return "Maxwell" ++ if major == 6: ++ return "Pascal" ++ if major == 7: ++ # 7.0/7.2 = Volta, 7.5 = Turing ++ if minor >= 5: ++ return "Turing" ++ return "Volta" ++ if major == 8: ++ return "Ampere" ++ if major == 9: ++ return "Hopper" ++ if major == 10: ++ return "Blackwell" ++ return f"Unknown (sm_{major}{minor})" ++ ++ ++def get_gpu_info() -> GPUInfo | None: ++ """Capture GPU architecture information.""" ++ try: ++ import torch ++ ++ if not torch.cuda.is_available(): ++ return None ++ ++ device_id = torch.cuda.current_device() ++ props = torch.cuda.get_device_properties(device_id) ++ major, minor = torch.cuda.get_device_capability(device_id) ++ ++ return GPUInfo( ++ architecture=_get_architecture_name(major, minor), ++ device_name=props.name, ++ memory_gb=props.total_memory / (1024**3), ++ cuda_version=str(torch.version.cuda), ++ ) ++ except Exception as e: ++ logger.warning(f"Failed to capture GPU info: {e}") ++ return None ++ ++ ++def generate_torchrun_flags_from_env() -> list[str]: ++ """Generate torchrun flags for distributed training.""" ++ # These values are typically injected by the Volcano/PyTorch operator ++ # or the Core Jobs Service when using DistributedGPUExecutionProvider. 
++ master_addr = os.environ.get("MASTER_ADDR", "localhost") ++ master_port = os.environ.get("MASTER_PORT", "23456") # Default to port from volcano_job.py ++ node_rank = os.environ.get("NODE_RANK", os.environ.get("RANK", "0")) ++ num_nodes = os.environ.get("WORLD_SIZE", "1") ++ gpus_per_node = os.environ.get("GPUS_PER_NODE") ++ if gpus_per_node is None: ++ try: ++ import torch ++ ++ gpus_per_node = str(torch.cuda.device_count()) ++ except Exception as e: ++ logger.warning(f"Failed to determine number of GPUs: {e}, using default of 1") ++ gpus_per_node = "1" ++ ++ return [ ++ "--nnodes", ++ num_nodes, ++ "--nproc_per_node", ++ gpus_per_node, ++ "--node_rank", ++ node_rank, ++ "--rdzv_id", ++ os.environ.get(NEMO_JOB_ID_ENVVAR, "customizer-rdzv"), ++ "--rdzv_backend", ++ "c10d", ++ "--rdzv_endpoint", ++ f"{master_addr}:{master_port}", ++ ] +diff --git a/services/automodel/tests/test_adapter.py b/services/automodel/tests/test_adapter.py +new file mode 100644 +index 000000000..dfd1f44b4 +--- /dev/null ++++ b/services/automodel/tests/test_adapter.py +@@ -0,0 +1,35 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
++# SPDX-License-Identifier: Apache-2.0 ++ ++from nmp.automodel.adapter import automodel_spec_to_compiler_output ++from nmp.automodel.api.v2.jobs.schemas import DistillationTraining, SFTTraining ++ ++ ++def test_adapter_sft() -> None: ++ spec = automodel_spec_to_compiler_output( ++ { ++ "model": "meta/llama", ++ "dataset": {"training": "default/train"}, ++ "training": {"training_type": "sft", "finetuning_type": "lora"}, ++ "output": {"name": "out", "type": "adapter", "fileset": "out-fs"}, ++ }, ++ ) ++ assert isinstance(spec.training, SFTTraining) ++ assert spec.dataset == "default/train" ++ ++ ++def test_adapter_distillation() -> None: ++ spec = automodel_spec_to_compiler_output( ++ { ++ "model": "meta/llama", ++ "dataset": {"training": "default/train"}, ++ "training": { ++ "training_type": "distillation", ++ "finetuning_type": "all_weights", ++ "teacher_model": "meta/teacher", ++ }, ++ "output": {"name": "out", "type": "model", "fileset": "out-fs"}, ++ }, ++ ) ++ assert isinstance(spec.training, DistillationTraining) ++ assert spec.training.teacher_model == "meta/teacher" +diff --git a/services/automodel/tests/test_compiler.py b/services/automodel/tests/test_compiler.py +new file mode 100644 +index 000000000..9959de814 +--- /dev/null ++++ b/services/automodel/tests/test_compiler.py +@@ -0,0 +1,119 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
++# SPDX-License-Identifier: Apache-2.0 ++ ++from __future__ import annotations ++ ++import json ++from datetime import datetime ++from pathlib import Path ++ ++from unittest.mock import AsyncMock, Mock ++ ++import pytest ++from nemo_platform import AsyncNeMoPlatform ++from nemo_platform.types.models.model_entity import ModelEntity ++from nmp.automodel.adapter import automodel_spec_to_compiler_output ++from nmp.automodel.compile import platform_job_config_compiler ++from nmp.automodel.images import DEFAULT_AUTOMODEL_IMAGE_REGISTRY, TASKS_IMAGE_NAME, TRAINING_IMAGE_NAME ++from nmp.common.entities.utils import get_random_id ++ ++ ++def _make_mock_model_entity( ++ workspace: str = "default", ++ name: str = "test-target", ++ fileset: str = "default/base-model", ++) -> ModelEntity: ++ return ModelEntity( ++ id=get_random_id("model"), ++ workspace=workspace, ++ name=name, ++ fileset=fileset, ++ trust_remote_code=False, ++ finetuning_type=None, ++ created_at=datetime.now(), ++ updated_at=datetime.now(), ++ ) ++ ++ ++@pytest.fixture ++def mock_sdk(): ++ sdk = Mock(spec=AsyncNeMoPlatform) ++ sdk.models = Mock() ++ sdk.models.retrieve = AsyncMock( ++ side_effect=lambda name, workspace, verbose=True: _make_mock_model_entity(workspace=workspace, name=name), ++ ) ++ sdk.files = Mock() ++ sdk.files.filesets = Mock() ++ sdk.files.filesets.retrieve = AsyncMock(return_value=Mock()) ++ return sdk ++ ++ ++@pytest.mark.asyncio ++async def test_platform_job_config_compiler_sft_lora(mock_sdk, monkeypatch): ++ monkeypatch.setattr( ++ "nmp.automodel.app.jobs.compiler.fetch_model_entity", ++ AsyncMock(return_value=_make_mock_model_entity()), ++ ) ++ contract_dir = Path(__file__).resolve().parents[3] / "tests" / "customizer-automodel-contract" / "input_configs" ++ input_path = contract_dir / "llama-3.2-1b" / "llama_3_2_1b_lora.json" ++ if not input_path.exists(): ++ pytest.skip("contract configs not present") ++ ++ raw = json.loads(input_path.read_text()) ++ plugin_shape = { ++ "model": 
raw["model"]["path"], ++ "dataset": {"training": "default/train-data"}, ++ "training": { ++ "training_type": "sft", ++ "finetuning_type": "lora", ++ "lora": { ++ "rank": raw["training"]["lora"]["rank"], ++ "alpha": raw["training"]["lora"]["alpha"], ++ "merge": False, ++ }, ++ "max_seq_length": raw["model"]["max_seq_length"], ++ }, ++ "schedule": { ++ "epochs": raw["schedule"]["epochs"], ++ "max_steps": raw["schedule"]["max_steps"], ++ }, ++ "batch": { ++ "global_batch_size": raw["batch"]["global_batch_size"], ++ "micro_batch_size": raw["batch"]["micro_batch_size"], ++ }, ++ "optimizer": {"learning_rate": raw["optimizer"]["learning_rate"]}, ++ "parallelism": { ++ "num_nodes": raw["parallelism"]["num_nodes"], ++ "num_gpus_per_node": raw["parallelism"]["num_gpus_per_node"], ++ "tensor_parallel_size": raw["parallelism"]["tensor_parallel_size"], ++ }, ++ "output": {"name": "test-out", "type": "adapter", "fileset": "test-out-fs"}, ++ } ++ compiler_spec = automodel_spec_to_compiler_output(plugin_shape) ++ spec = await platform_job_config_compiler(compiler_spec, "default", mock_sdk) ++ ++ steps = spec.steps if hasattr(spec, "steps") else spec["steps"] ++ assert len(steps) == 4 ++ training_step = steps[1] ++ training_name = training_step.name if hasattr(training_step, "name") else training_step["name"] ++ assert training_name == "customization-training-job" ++ training_cmd = ( ++ training_step.executor.container.command ++ if hasattr(training_step, "executor") ++ else training_step["executor"]["container"]["command"] ++ ) ++ assert "nmp.automodel.tasks.training" in " ".join(training_cmd) ++ download_cmd = steps[0].executor.container.command if hasattr(steps[0], "executor") else steps[0]["executor"]["container"]["command"] ++ assert download_cmd[-1] == "nmp.automodel.tasks.file_io" ++ ++ def _step_image(step) -> str: ++ if hasattr(step, "executor"): ++ return step.executor.container.image ++ return step["executor"]["container"]["image"] ++ ++ tasks_image = 
f"{DEFAULT_AUTOMODEL_IMAGE_REGISTRY}/{TASKS_IMAGE_NAME}" ++ training_image = f"{DEFAULT_AUTOMODEL_IMAGE_REGISTRY}/{TRAINING_IMAGE_NAME}" ++ assert _step_image(steps[0]).startswith(tasks_image) ++ assert _step_image(steps[1]).startswith(training_image) ++ assert _step_image(steps[2]).startswith(tasks_image) ++ assert _step_image(steps[3]).startswith(tasks_image) +diff --git a/services/automodel/tests/test_contract_configs.py b/services/automodel/tests/test_contract_configs.py +new file mode 100644 +index 000000000..0d808033f +--- /dev/null ++++ b/services/automodel/tests/test_contract_configs.py +@@ -0,0 +1,78 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Contract continuity: compile_automodel_config import path and optional snapshot check.""" ++ ++from __future__ import annotations ++ ++import json ++import subprocess ++import sys ++from pathlib import Path ++ ++import pytest ++ ++REPO_ROOT = Path(__file__).resolve().parents[3] ++CONTRACT_DIR = REPO_ROOT / "tests" / "customizer-automodel-contract" ++GENERATE_SCRIPT = CONTRACT_DIR / "generate_configs.py" ++ ++# v1 excludes embedding SFT until product expands scope. 
++EMBEDDING_CONFIG_STEMS = {"embed_1b_lora", "embed_1b_full_sft"} ++ ++ ++@pytest.mark.skipif(not CONTRACT_DIR.is_dir(), reason="contract fixtures not in tree") ++def test_generate_configs_import_path() -> None: ++ """generate_configs.py must import compile_automodel_config from backends.config.""" ++ text = GENERATE_SCRIPT.read_text() ++ assert "backends.config import compile_automodel_config" in text ++ assert "backends.automodel.config" not in text ++ ++ ++@pytest.mark.skipif(not CONTRACT_DIR.is_dir(), reason="contract fixtures not in tree") ++@pytest.mark.parametrize( ++ "config_name", ++ [ ++ "llama_3_2_1b_lora", ++ "llama_3_2_1b_lora_packing", ++ "nemotron_nano_lora_packing", ++ ], ++) ++def test_contract_input_parses_as_training_step_config(config_name: str) -> None: ++ from nmp.automodel.tasks.training.schemas import TrainingStepConfig ++ ++ input_path = CONTRACT_DIR / "input_configs" / "llama-3.2-1b" / f"{config_name}.json" ++ if config_name.startswith("nemotron"): ++ input_path = CONTRACT_DIR / "input_configs" / "nemotron-nano" / f"{config_name}.json" ++ if not input_path.exists(): ++ pytest.skip(f"missing {input_path}") ++ ++ raw = json.loads(input_path.read_text()) ++ raw.pop("backend", None) ++ TrainingStepConfig.model_validate(raw) ++ ++ ++@pytest.mark.skipif(not CONTRACT_DIR.is_dir(), reason="contract fixtures not in tree") ++def test_contract_output_configs_up_to_date_excluding_embedding() -> None: ++ """Run generate_configs --check when nemo_automodel is available in the environment.""" ++ pytest.importorskip("nemo_automodel") ++ if not GENERATE_SCRIPT.is_file(): ++ pytest.skip("generate_configs.py missing") ++ ++ env = dict(**__import__("os").environ) ++ env["PYTHONPATH"] = str(REPO_ROOT / "services" / "automodel" / "src") ++ ++ result = subprocess.run( ++ [sys.executable, str(GENERATE_SCRIPT), "--check"], ++ cwd=CONTRACT_DIR, ++ env=env, ++ capture_output=True, ++ text=True, ++ ) ++ if result.returncode != 0: ++ combined = result.stdout + 
result.stderr ++ for stem in EMBEDDING_CONFIG_STEMS: ++ if stem in combined: ++ pytest.skip("contract check failed on embedding configs (excluded from v1)") ++ if "nemo_automodel" in combined and "ModuleNotFoundError" in combined: ++ pytest.skip("nemo_automodel not installed in test env (run in training image CI)") ++ pytest.fail(f"contract configs out of date:\n{combined}") +diff --git a/services/automodel/tests/test_images.py b/services/automodel/tests/test_images.py +new file mode 100644 +index 000000000..b62b39a9f +--- /dev/null ++++ b/services/automodel/tests/test_images.py +@@ -0,0 +1,46 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++from nmp.automodel.config import AutomodelConfig ++from nmp.automodel.images import ( ++ DEFAULT_AUTOMODEL_IMAGE_REGISTRY, ++ TASKS_IMAGE_NAME, ++ TRAINING_IMAGE_NAME, ++ get_automodel_qualified_image, ++ get_tasks_image, ++ get_training_image, ++) ++ ++ ++def test_default_automodel_images_use_nvcr_dev_registry(monkeypatch): ++ monkeypatch.setattr("nmp.automodel.images.config", AutomodelConfig()) ++ ++ tasks = get_tasks_image() ++ training = get_training_image() ++ ++ assert tasks == f"{DEFAULT_AUTOMODEL_IMAGE_REGISTRY}/{TASKS_IMAGE_NAME}:local" ++ assert training == f"{DEFAULT_AUTOMODEL_IMAGE_REGISTRY}/{TRAINING_IMAGE_NAME}:local" ++ assert "/" in TASKS_IMAGE_NAME ++ ++ ++def test_automodel_image_registry_override(monkeypatch): ++ monkeypatch.setattr( ++ "nmp.automodel.images.config", ++ AutomodelConfig(image_registry="nvcr.io/0921617854601259/other-registry"), ++ ) ++ ++ assert ( ++ get_automodel_qualified_image(TASKS_IMAGE_NAME) ++ == "nvcr.io/0921617854601259/other-registry/nmp/automodel-tasks:local" ++ ) ++ ++ ++def test_automodel_full_image_override(monkeypatch): ++ monkeypatch.setattr( ++ "nmp.automodel.images.config", ++ AutomodelConfig( ++ 
tasks_image="nvcr.io/0921617854601259/nemo-platform-dev/nmp/automodel-tasks:dev", ++ ), ++ ) ++ ++ assert get_tasks_image() == "nvcr.io/0921617854601259/nemo-platform-dev/nmp/automodel-tasks:dev" +diff --git a/services/automodel/tests/test_job_context.py b/services/automodel/tests/test_job_context.py +new file mode 100644 +index 000000000..55efea6a1 +--- /dev/null ++++ b/services/automodel/tests/test_job_context.py +@@ -0,0 +1,73 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++"""Unit tests for NMPJobContext.""" ++ ++from pathlib import Path ++ ++import pytest ++from nmp.automodel.app.constants import DEFAULT_JOB_STORAGE_PATH, NMP_FILES_URL_ENVVAR, NMP_JOBS_URL_ENVVAR ++from nmp.automodel.app.jobs.context import ( ++ DEFAULT_ATTEMPT_ID, ++ DEFAULT_JOB_ID, ++ DEFAULT_STEP, ++ DEFAULT_TASK, ++ NMPJobContext, ++) ++from nmp.common.entities.constants import DEFAULT_WORKSPACE ++from nmp.common.jobs.constants import ( ++ DEFAULT_NEMO_JOB_STEP_CONFIG_FILE_PATH, ++ NEMO_JOB_ATTEMPT_ID_ENVVAR, ++ NEMO_JOB_ID_ENVVAR, ++ NEMO_JOB_STEP_CONFIG_FILE_PATH_ENVVAR, ++ NEMO_JOB_STEP_ENVVAR, ++ NEMO_JOB_TASK_ENVVAR, ++ NEMO_JOB_WORKSPACE_ENVVAR, ++ PERSISTENT_JOB_STORAGE_PATH_ENVVAR, ++) ++ ++ ++class TestNMPJobContextFromEnv: ++ def test_uses_defaults_when_env_vars_not_set(self, monkeypatch: pytest.MonkeyPatch) -> None: ++ for var in ( ++ NEMO_JOB_WORKSPACE_ENVVAR, ++ NEMO_JOB_ID_ENVVAR, ++ NEMO_JOB_ATTEMPT_ID_ENVVAR, ++ NEMO_JOB_STEP_ENVVAR, ++ NEMO_JOB_TASK_ENVVAR, ++ NMP_JOBS_URL_ENVVAR, ++ NMP_FILES_URL_ENVVAR, ++ PERSISTENT_JOB_STORAGE_PATH_ENVVAR, ++ NEMO_JOB_STEP_CONFIG_FILE_PATH_ENVVAR, ++ ): ++ monkeypatch.delenv(var, raising=False) ++ ++ ctx = NMPJobContext.from_env() ++ ++ assert ctx.workspace == DEFAULT_WORKSPACE ++ assert ctx.job_id == DEFAULT_JOB_ID ++ assert ctx.attempt_id == DEFAULT_ATTEMPT_ID ++ assert ctx.step == DEFAULT_STEP ++ assert ctx.task == DEFAULT_TASK 
++ assert ctx.jobs_url is None ++ assert ctx.files_url is None ++ assert ctx.storage_path == Path(DEFAULT_JOB_STORAGE_PATH) ++ assert ctx.config_path == Path(DEFAULT_NEMO_JOB_STEP_CONFIG_FILE_PATH) ++ ++ def test_uses_env_vars_when_set(self, monkeypatch: pytest.MonkeyPatch) -> None: ++ monkeypatch.setenv(NEMO_JOB_WORKSPACE_ENVVAR, "test-workspace") ++ monkeypatch.setenv(NEMO_JOB_ID_ENVVAR, "job-123") ++ monkeypatch.setenv(NEMO_JOB_ATTEMPT_ID_ENVVAR, "attempt-5") ++ monkeypatch.setenv(NEMO_JOB_STEP_ENVVAR, "training") ++ monkeypatch.setenv(NEMO_JOB_TASK_ENVVAR, "train-model") ++ monkeypatch.setenv(NMP_JOBS_URL_ENVVAR, "http://jobs.example.com") ++ monkeypatch.setenv(NMP_FILES_URL_ENVVAR, "http://files.example.com") ++ monkeypatch.setenv(PERSISTENT_JOB_STORAGE_PATH_ENVVAR, "/custom/storage") ++ monkeypatch.setenv(NEMO_JOB_STEP_CONFIG_FILE_PATH_ENVVAR, "/custom/config.json") ++ ++ ctx = NMPJobContext.from_env() ++ ++ assert ctx.workspace == "test-workspace" ++ assert ctx.job_id == "job-123" ++ assert ctx.normalized_task == "task-train-model" ++ assert ctx.jobs_url == "http://jobs.example.com" +diff --git a/services/automodel/tests/test_platform_client.py b/services/automodel/tests/test_platform_client.py +new file mode 100644 +index 000000000..c33b52cab +--- /dev/null ++++ b/services/automodel/tests/test_platform_client.py +@@ -0,0 +1,14 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
++# SPDX-License-Identifier: Apache-2.0 ++ ++from nmp.automodel.app.jobs.file_io.schemas import FileSetRef ++ ++ ++def test_fileset_ref_parse() -> None: ++ ref = FileSetRef.model_validate("acme-corp/my-dataset") ++ assert ref.workspace == "acme-corp" ++ assert ref.name == "my-dataset" ++ ++ bare = FileSetRef.model_validate("my-dataset") ++ assert bare.workspace is None ++ assert bare.name == "my-dataset" +diff --git a/services/automodel/tests/test_progress_reporter.py b/services/automodel/tests/test_progress_reporter.py +new file mode 100644 +index 000000000..b8eba5ebf +--- /dev/null ++++ b/services/automodel/tests/test_progress_reporter.py +@@ -0,0 +1,40 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++from __future__ import annotations ++ ++from pathlib import Path ++from unittest.mock import MagicMock ++ ++from nmp.automodel.app.jobs.context import NMPJobContext ++from nemo_platform import omit ++from nmp.automodel.tasks.progress_reporter import JobsServiceProgressReporter ++from nmp.common.jobs.schemas import PlatformJobStatus ++ ++ ++def test_progress_reporter_calls_sdk_create_or_update() -> None: ++ sdk = MagicMock() ++ ctx = NMPJobContext( ++ workspace="ws-a", ++ job_id="job-1", ++ attempt_id="attempt-0", ++ step="training", ++ task="train-model", ++ jobs_url="http://jobs.example.com", ++ files_url=None, ++ storage_path=Path("/tmp/job"), ++ config_path=Path("/tmp/job/config.json"), ++ ) ++ reporter = JobsServiceProgressReporter(sdk, ctx.workspace, ctx.job_id, ctx.step, ctx.normalized_task) ++ reporter.update_progress(PlatformJobStatus.ACTIVE, status_details={"phase": "training"}) ++ ++ sdk.jobs.tasks.create_or_update.assert_called_once_with( ++ ctx.normalized_task, ++ workspace=ctx.workspace, ++ job=ctx.job_id, ++ step=ctx.step, ++ status=PlatformJobStatus.ACTIVE.value, ++ status_details={"phase": "training"}, ++ error_details=omit, ++ error_stack=omit, 
++ ) +diff --git a/services/automodel/tests/test_validators.py b/services/automodel/tests/test_validators.py +new file mode 100644 +index 000000000..a3f904d89 +--- /dev/null ++++ b/services/automodel/tests/test_validators.py +@@ -0,0 +1,22 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++import pytest ++from nmp.automodel.entities.validators import validate_fileset_uri ++ ++ ++def test_validate_fileset_workspace_name() -> None: ++ assert validate_fileset_uri("acme-corp/train-data") == "acme-corp/train-data" ++ ++ ++def test_validate_fileset_bare_name() -> None: ++ assert validate_fileset_uri("train-data") == "train-data" ++ ++ ++def test_validate_strips_legacy_fileset_prefix() -> None: ++ assert validate_fileset_uri("fileset://acme-corp/train-data") == "acme-corp/train-data" ++ ++ ++def test_validate_rejects_hf_protocol() -> None: ++ with pytest.raises(ValueError, match="Unsupported"): ++ validate_fileset_uri("hf://org/dataset") +diff --git a/services/core/jobs/src/nmp/core/jobs/config.py b/services/core/jobs/src/nmp/core/jobs/config.py +index ce2d30ee8..3c71a0fbd 100644 +--- a/services/core/jobs/src/nmp/core/jobs/config.py ++++ b/services/core/jobs/src/nmp/core/jobs/config.py +@@ -5,7 +5,7 @@ + + from typing import Self + +-from nmp.common.config import create_service_config_class, get_platform_config, get_service_config ++from nmp.common.config import Runtime, create_service_config_class, get_platform_config, get_service_config + from nmp.core.jobs.app.profiles import ExecutionProfileT + from nmp.core.jobs.controllers.backends.config import ( + DefaultExecutionProfileConfig, +@@ -30,6 +30,19 @@ class JobsServiceConfig(create_service_config_class("jobs")): # type: ignore + ) + reconcile_interval_seconds: int = Field(default=2, description="Interval in seconds for the job reconciler to run") + schedule_interval_seconds: int = Field(default=5, 
description="Interval in seconds for the job scheduler to run") ++ enable_subprocess_executor: bool | None = Field( ++ default=None, ++ description=( ++ "Register the subprocess/default execution profile. When unset, defaults to true for " ++ "docker/none runtimes and false for kubernetes." ++ ), ++ ) ++ ++ def resolved_enable_subprocess_executor(self) -> bool: ++ """Whether host subprocess execution is registered for default profiles.""" ++ if self.enable_subprocess_executor is not None: ++ return self.enable_subprocess_executor ++ return get_platform_config().runtime != Runtime.KUBERNETES + + @model_validator(mode="after") + def validate_executors(self) -> Self: +@@ -55,5 +68,6 @@ profiles = merge_executor_profiles( + get_default_executor_profiles_for_runtime( + runtime=get_platform_config().runtime, + defaults=config.executor_defaults, ++ enable_subprocess_executor=config.resolved_enable_subprocess_executor(), + ), + ) +diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py +index 0e137b8ad..1e5a2d7fc 100644 +--- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py ++++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py +@@ -41,8 +41,15 @@ class DefaultExecutionProfileConfig(BaseModel): + ) + + +-def get_default_executor_profiles_for_runtime(runtime: Runtime, defaults: DefaultExecutionProfileConfig) -> list: ++def get_default_executor_profiles_for_runtime( ++ runtime: Runtime, ++ defaults: DefaultExecutionProfileConfig, ++ *, ++ enable_subprocess_executor: bool | None = None, ++) -> list: + """Returns a list of default executor profiles based on the deployment runtime.""" ++ if enable_subprocess_executor is None: ++ enable_subprocess_executor = runtime != Runtime.KUBERNETES + + logger.debug("Getting default executors for runtime: %s", runtime) + executors = [] +@@ -87,9 +94,7 @@ def get_default_executor_profiles_for_runtime(runtime: 
Runtime, defaults: Defaul + ] + ) + +- # Subprocess execution is available for single-host runtimes only. Kubernetes deployments must opt in +- # explicitly so subprocess profiles do not appear on distributed service pods by default. +- if runtime != Runtime.KUBERNETES: ++ if enable_subprocess_executor: + executors.append( + SubprocessJobExecutionProfile( + provider="subprocess", +diff --git a/tests/customizer-automodel-contract/generate_configs.py b/tests/customizer-automodel-contract/generate_configs.py +index 49979217d..65a8b9a56 100644 +--- a/tests/customizer-automodel-contract/generate_configs.py ++++ b/tests/customizer-automodel-contract/generate_configs.py +@@ -48,17 +48,20 @@ import yaml + + SCRIPT_DIR = Path(__file__).resolve().parent + REPO_ROOT = SCRIPT_DIR.parent.parent ++AUTOMODEL_SRC = REPO_ROOT / "services" / "automodel" / "src" + CUSTOMIZER_SRC = REPO_ROOT / "services" / "customizer" / "src" + +-if CUSTOMIZER_SRC.is_dir(): ++if AUTOMODEL_SRC.is_dir(): ++ sys.path.insert(0, str(AUTOMODEL_SRC)) ++elif CUSTOMIZER_SRC.is_dir(): + sys.path.insert(0, str(CUSTOMIZER_SRC)) + else: +- sys.path.insert(0, "/app/services/customizer/src") ++ sys.path.insert(0, "/app/services/automodel/src") + +-from nmp.customizer.app.constants import V4_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES # noqa: E402 +-from nmp.customizer.app.jobs.context import NMPJobContext # noqa: E402 +-from nmp.customizer.tasks.training.backends.automodel.config import compile_automodel_config # noqa: E402 +-from nmp.customizer.tasks.training.schemas import TrainingStepConfig # noqa: E402 ++from nmp.automodel.app.constants import V4_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES # noqa: E402 ++from nmp.automodel.app.jobs.context import NMPJobContext # noqa: E402 ++from nmp.automodel.tasks.training.backends.config import compile_automodel_config # noqa: E402 ++from nmp.automodel.tasks.training.schemas import TrainingStepConfig # noqa: E402 + + INPUT_DIR = SCRIPT_DIR / "input_configs" + OUTPUT_DIR = SCRIPT_DIR / 
"output_configs" +diff --git a/tests/smoke_gpu/conftest.py b/tests/smoke_gpu/conftest.py +index ae32d9c19..6e0c602ba 100644 +--- a/tests/smoke_gpu/conftest.py ++++ b/tests/smoke_gpu/conftest.py +@@ -9,3 +9,9 @@ def pytest_configure(config): + "markers", "smoke_customizer_automodel: Import smoke tests for the customizer-automodel image" + ) + config.addinivalue_line("markers", "smoke_customizer_rl: Import smoke tests for the customizer-rl image") ++ config.addinivalue_line( ++ "markers", "smoke_nmp_automodel_tasks: Import smoke tests for the nmp/automodel-tasks image" ++ ) ++ config.addinivalue_line( ++ "markers", "smoke_nmp_automodel_training: Import smoke tests for the nmp/automodel-training image" ++ ) +diff --git a/tests/smoke_gpu/test_nemo_automodel.py b/tests/smoke_gpu/test_nemo_automodel.py +new file mode 100644 +index 000000000..6f3157872 +--- /dev/null ++++ b/tests/smoke_gpu/test_nemo_automodel.py +@@ -0,0 +1,42 @@ ++# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. ++# SPDX-License-Identifier: Apache-2.0 ++ ++"""NeMo Automodel image import smoke tests. ++ ++Built as part of the nmp-automodel docker bake group (smoke-test stage) and run ++on a CPU runner - no GPU hardware required. 
++""" ++ ++import pytest ++ ++ ++def test_torch_importable(): ++ import torch # noqa: F401 ++ ++ ++def test_transformers_importable(): ++ import transformers # noqa: F401 ++ ++ ++def test_mamba_ssm_importable(): ++ import mamba_ssm # noqa: F401 ++ ++ ++def test_causal_conv1d_importable(): ++ import causal_conv1d # noqa: F401 ++ ++ ++def test_bitsandbytes_importable(): ++ import bitsandbytes # noqa: F401 ++ ++ ++@pytest.mark.smoke_nmp_automodel_tasks ++def test_nmp_automodel_tasks_importable(): ++ from nmp.automodel.tasks import file_io # noqa: F401 ++ from nmp.automodel.tasks.model_entity import __main__ as model_entity_main # noqa: F401 ++ ++ ++@pytest.mark.smoke_nmp_automodel_training ++def test_nmp_automodel_training_importable(): ++ import nemo_automodel # noqa: F401 ++ from nmp.automodel.tasks.training import __main__ as training_main # noqa: F401 +diff --git a/uv.lock b/uv.lock +index 61041b551..caddbda93 100644 +--- a/uv.lock ++++ b/uv.lock +@@ -41,6 +41,8 @@ members = [ + "nemo-agents-plugin", + "nemo-anonymizer-plugin", + "nemo-auditor-plugin", ++ "nemo-automodel-plugin", ++ "nemo-customizer-plugin", + "nemo-data-designer-plugin", + "nemo-evaluator-plugin", + "nemo-evaluator-sdk", +@@ -55,6 +57,7 @@ members = [ + "nemoplatform", + "nmp-auditor", + "nmp-auth", ++ "nmp-automodel", + "nmp-build-tools", + "nmp-common", + "nmp-core-mcp", +@@ -5376,6 +5379,84 @@ dev = [ + { name = "ruff", specifier = ">=0.11.8" }, + ] + ++[[package]] ++name = "nemo-automodel-plugin" ++version = "0.1.0" ++source = { editable = "plugins/nemo-automodel" } ++dependencies = [ ++ { name = "nemo-platform", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 
'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "nemo-platform-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 
'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "nmp-automodel", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 
'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "pydantic", extra = ["email"], marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and 
extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "pydantic-settings", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 
'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "typer", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 
'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++] ++ ++[package.dev-dependencies] ++dev = [ ++ { name = "fastapi", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 
'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "httpx", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 
'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "nemo-customizer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "pytest", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or 
(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "pytest-asyncio", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 
'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "ruff", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 
'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++] ++ ++[package.metadata] ++requires-dist = [ ++ { name = "nemo-platform", editable = "packages/nemo_platform" }, ++ { name = "nemo-platform-plugin", editable = "packages/nemo_platform_plugin" }, ++ { name = "nmp-automodel", editable = "services/automodel" }, ++ { name = "pydantic", specifier = ">=2.10.6" }, ++ { name = "pydantic-settings", specifier = ">=2.6.1" }, ++ { name = "typer", specifier = ">=0.12.5" }, ++] ++ ++[package.metadata.requires-dev] ++dev = [ ++ { name = "fastapi", specifier = ">=0.115.0" }, ++ { name = "httpx", specifier = ">=0.27.0" }, ++ { name = "nemo-customizer-plugin", editable = "plugins/nemo-customizer" }, ++ { name = "pytest", specifier = 
">=8.3.4" }, ++ { name = "pytest-asyncio", specifier = ">=0.25.3" }, ++ { name = "ruff", specifier = ">=0.11.8" }, ++] ++ ++[[package]] ++name = "nemo-customizer-plugin" ++version = "0.1.0" ++source = { editable = "plugins/nemo-customizer" } ++dependencies = [ ++ { name = "nemo-platform", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 
'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "nemo-platform-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "pydantic", extra = ["email"], marker = "(platform_machine == 'arm64' and sys_platform == 
'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "typer", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 
'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++] ++ ++[package.dev-dependencies] ++dev = [ ++ { name = "fastapi", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 
'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "pytest", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 
'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "pytest-asyncio", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or 
(sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "ruff", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 
'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++] ++ ++[package.metadata] ++requires-dist = [ ++ { name = "nemo-platform", editable = "packages/nemo_platform" }, ++ { name = "nemo-platform-plugin", editable = "packages/nemo_platform_plugin" }, ++ { name = "pydantic", specifier = ">=2.10.6" }, ++ { name = "typer", specifier = ">=0.12.5" }, ++] ++ ++[package.metadata.requires-dev] ++dev = [ ++ { name = "fastapi", specifier = ">=0.115.0" }, ++ { name = "pytest", specifier = ">=8.3.4" }, ++ { name = "pytest-asyncio", specifier = ">=0.25.3" }, ++ { name = "ruff", specifier = ">=0.11.8" }, ++] ++ + [[package]] + name = "nemo-data-designer-plugin" + version = "0.1.0" +@@ -7155,6 +7236,8 @@ core-services = [ + { name = "nemo-agents-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and 
sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-anonymizer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 
'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-auditor-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 
'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "nemo-automodel-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra 
== 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "nemo-customizer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 
'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-data-designer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra 
== 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-evaluator-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 
'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-guardrails-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, +@@ -7255,6 +7338,8 @@ 
enabled-plugins = [ + { name = "nemo-agents-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-anonymizer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or 
(platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-auditor-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 
'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "nemo-automodel-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or 
(sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "nemo-customizer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and 
sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-data-designer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 
'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-evaluator-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') 
or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-guardrails-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 
'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, +@@ -7264,6 +7349,8 @@ functional-services = [ + { name = "nemo-agents-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 
'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-anonymizer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') 
or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-auditor-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 
'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "nemo-automodel-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "nemo-customizer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 
'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-data-designer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 
'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-evaluator-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' 
and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-guardrails-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 
'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, +@@ -7361,6 +7448,8 @@ core-services = [ + { name = "nemo-agents-plugin", editable = "plugins/nemo-agents" }, + { name = "nemo-anonymizer-plugin", editable = "plugins/nemo-anonymizer" }, + { name = "nemo-auditor-plugin", editable = "plugins/nemo-auditor" }, ++ { name = "nemo-automodel-plugin", editable = "plugins/nemo-automodel" }, ++ { name = "nemo-customizer-plugin", editable = "plugins/nemo-customizer" }, + { name = "nemo-data-designer-plugin", editable = "plugins/nemo-data-designer" }, + { name = "nemo-evaluator-plugin", editable = "plugins/nemo-evaluator" }, + { name = "nemo-guardrails-plugin", editable = "plugins/nemo-guardrails" }, +@@ -7464,6 +7553,8 @@ enabled-plugins = [ + { name = "nemo-agents-plugin", editable = "plugins/nemo-agents" }, + { name = "nemo-anonymizer-plugin", editable = 
"plugins/nemo-anonymizer" }, + { name = "nemo-auditor-plugin", editable = "plugins/nemo-auditor" }, ++ { name = "nemo-automodel-plugin", editable = "plugins/nemo-automodel" }, ++ { name = "nemo-customizer-plugin", editable = "plugins/nemo-customizer" }, + { name = "nemo-data-designer-plugin", editable = "plugins/nemo-data-designer" }, + { name = "nemo-evaluator-plugin", editable = "plugins/nemo-evaluator" }, + { name = "nemo-guardrails-plugin", editable = "plugins/nemo-guardrails" }, +@@ -7473,6 +7564,8 @@ functional-services = [ + { name = "nemo-agents-plugin", editable = "plugins/nemo-agents" }, + { name = "nemo-anonymizer-plugin", editable = "plugins/nemo-anonymizer" }, + { name = "nemo-auditor-plugin", editable = "plugins/nemo-auditor" }, ++ { name = "nemo-automodel-plugin", editable = "plugins/nemo-automodel" }, ++ { name = "nemo-customizer-plugin", editable = "plugins/nemo-customizer" }, + { name = "nemo-data-designer-plugin", editable = "plugins/nemo-data-designer" }, + { name = "nemo-evaluator-plugin", editable = "plugins/nemo-evaluator" }, + { name = "nemo-guardrails-plugin", editable = "plugins/nemo-guardrails" }, +@@ -7682,6 +7775,42 @@ dev = [ + { name = "typer", specifier = ">=0.9.0" }, + ] + ++[[package]] ++name = "nmp-automodel" ++version = "0.1.0" ++source = { editable = "services/automodel" } ++dependencies = [ ++ { name = "aiofiles", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 
'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "httpx", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or 
(sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "nemo-platform", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 
'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "nmp-common", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 
'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "pydantic", extra = ["email"], marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 
'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "pydantic-settings", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 
'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++] ++ ++[package.optional-dependencies] ++dev = [ ++ { name = "nmp-testing", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 
'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "pytest", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "pytest-asyncio", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or 
(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++ { name = "pytest-mock", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 
'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, ++] ++ ++[package.metadata] ++requires-dist = [ ++ { name = "aiofiles", specifier = ">=24.1.0" }, ++ { name = "httpx", specifier = ">=0.27.0" }, ++ { name = "nemo-platform", editable = "packages/nemo_platform" }, ++ { name = "nmp-common", editable = "packages/nmp_common" }, ++ { name = "nmp-testing", marker = "extra == 'dev'", editable = "packages/nmp_testing" }, ++ { name = "pydantic", specifier = ">=2.10.6" }, ++ { name = "pydantic-settings", specifier = ">=2.6.1" }, ++ { name = 
"pytest", marker = "extra == 'dev'", specifier = ">=8.3.4" }, ++ { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.25.3" }, ++ { name = "pytest-mock", marker = "extra == 'dev'", specifier = ">=3.14.0" }, ++] ++provides-extras = ["dev"] ++ + [[package]] + name = "nmp-build-tools" + version = "0.0.0" diff --git a/plugins/nemo-automodel/README.md b/plugins/nemo-automodel/README.md new file mode 100644 index 00000000..3c6df3a5 --- /dev/null +++ b/plugins/nemo-automodel/README.md @@ -0,0 +1,30 @@ +# nemo-automodel-plugin + +Automodel training contributor under `/apis/customization/v2/workspaces/{workspace}/automodel/`. + +Requires **`nemo-customizer-plugin`** at runtime (router + `client.customization` SDK) and **`nmp-automodel`** (compiler/tasks). The Automodel plugin does not declare a pyproject dependency on the customizer plugin — install both via root `enabled-plugins`: + +```bash +uv sync --group enabled-plugins +``` + +## CLI + +Verbs are mounted directly on the contributor (no `jobs` subgroup): + +```bash +nemo customization automodel explain +nemo customization automodel submit path/to/job.json +nemo customization automodel submit path/to/job.json -w acme-corp +nemo customization automodel submit path/to/job.json --cluster my-cluster +``` + +`run` is registered but **always fails** — Automodel training is submit-only (platform API / Docker GPU jobs), not local subprocess execution: + +```bash +nemo customization automodel run path/to/job.json # exits with error +``` + +Other customization backends may still use `nemo customization jobs submit ...`. + +Job JSON uses the simplified `AutomodelJobInput` schema (see `nemo_automodel_plugin/schema.py`). Submit posts to `/apis/customization/v2/workspaces/{workspace}/automodel/jobs`. 
diff --git a/plugins/nemo-automodel/SCOPE.md b/plugins/nemo-automodel/SCOPE.md new file mode 100644 index 00000000..1d5f8c85 --- /dev/null +++ b/plugins/nemo-automodel/SCOPE.md @@ -0,0 +1,967 @@ +# NeMo Automodel Plugin — Work Scope + +**Start here:** [Implementation order](#implementation-order) (sequence, checklists, success criteria). + +This document scopes the work to replace the legacy Customizer Automodel path with a first-party **NeMo Automodel plugin** (customization **contributor**), the **`nemo-customizer-plugin`** router at `/apis/customization`, and the **`nmp-automodel`** task/compiler package (no standalone HTTP server). Legacy `Platform/services/customizer/` is reference only. New work: `plugins/nemo-customizer/`, `plugins/nemo-automodel/`, `services/automodel/`. + +Training is powered by the upstream **`nemo_automodel`** library (repo: `Automodel/` at workspace root, NGC image `nvcr.io/nvidia/nemo-automodel:25.11.00`). + +--- + +## Implementation order + +Canonical sequencing for this scope. **Work breakdown** (below) and design sections add detail; checklists live here only. + +### Sequence overview + +| Step | Focus | Package / area | Blocks | +|------|--------|----------------|--------| +| **0** | Design lock + platform Jobs flag | cross-cutting | — | +| **1** | Customization router | `plugins/nemo-customizer` | Automodel HTTP (step 4) | +| **2** | Task/compiler library | `services/automodel` (`nmp-automodel`) | Images (step 3), contributor compile (step 4) | +| **3** | Container images | `nmp-automodel` Dockerfiles | E2E GPU runs | +| **4** | Automodel plugin + Docker gate | `plugins/nemo-automodel` | CLI submit (step 5), integration (step 6) | +| **5** | CLI submit path | `nemo-automodel` + router CLI | — | +| **6** | Tests & contracts | `Platform/tests/...` | — | +| **7** | SDK, OpenAPI, docs, deploy | platform + plugins | — | + +**Parallel OK:** Step 0 Jobs flag with step 1–2. 
Step 2 compiler port with step 1 router (after contributor protocol is sketched). + +```mermaid +flowchart LR + S0[Step 0 Design lock] + S1[Step 1 nemo-customizer] + S2[Step 2 nmp-automodel] + S3[Step 3 Images] + S4[Step 4 nemo-automodel plugin] + S5[Step 5 CLI] + S6[Step 6 Tests] + S7[Step 7 Docs deploy] + S0 --> S1 + S0 --> S2 + S1 --> S4 + S2 --> S3 + S2 --> S4 + S3 --> S6 + S4 --> S5 + S4 --> S6 + S5 --> S7 + S6 --> S7 +``` + +--- + +### Step 0 — Design lock & platform prerequisites + +Lock names, routes, schemas, and cross-cutting Jobs config before feature PRs. First implementation PR can be plugin + `nmp-automodel` without Studio migration. + +**Design lock checklist** + +- [x] **Name & routes:** Router `NemoService.name = customization`; Automodel contributor prefix `v2/workspaces/{workspace}/automodel` → `/apis/customization/v2/workspaces/{workspace}/automodel/...`; CLI `nemo customization automodel` — [URL routing](#url-routing-decided), [Customization router](#customization-router-in-scope--v1). +- [x] **Workspace contract:** Path `{workspace}` authoritative; spec uses workspace-relative names + optional `ws/name` qualifiers; dataset URI rules documented; **no** `workspace` key in JSON body — [Workspace scoping](#workspace-scoping-required). +- [x] **Simplified JSON schema:** Publish `AutomodelJobInput` (v1) for POST/CLI; `AutomodelJobOutput` for stored/GET; `extra="forbid"` — [Simplified JSON spec](#simplified-json-spec-draft--automodeljobinput-only). +- [x] **Schema validators (legacy parity):** Reject `output_model` with message to use `output` (legacy `CustomizationJobInput`); `model_config` / field validators for distillation-only fields when `training_type: sft`. +- [x] **Dataset shape:** `{ training, validation? }` fileset URIs; `to_spec()` runs `check_dataset_access` per ref (port `platform_client`) — legacy API used a single `dataset` string; mapping documented in migration guide. 
+- [x] **Integrations:** `wandb` / `mlflow` accept `api_key_secret` (`SecretRef`) plus enabled/project fields — not only `null` placeholders. +- [x] **v1 exclusions (locked):** `deployment_config` (post-train NIM deploy), embedding-model SFT, DPO/GRPO — see [Decisions](#decisions-resolved). +- [x] **Input vs canonical spec (Option A):** Two schemas + `to_spec()` — port `transform_input_to_output` — [Input vs canonical spec](#input-vs-canonical-spec--decided-option-a). +- [x] **Deprecation / Studio:** Legacy customizer not in default `AVAILABLE_SERVICES`; UI feature-flagged off — [Deprecation](#deprecation--platform-spin-up-and-studio-verified). + +**Workspace registration (do before first integration test):** + +- [x] Add `plugins/nemo-customizer`, `plugins/nemo-automodel`, `services/automodel` to root `Platform/pyproject.toml` workspace members. +- [x] Add `nemo-customizer-plugin` and `nemo-automodel-plugin` to `[dependency-groups] enabled-plugins` (pattern: `nemo-evaluator-plugin`). + +**Platform Jobs — `jobs.enable_subprocess_executor`** (cross-cutting, not Automodel-only; rationale in [Platform jobs: `runtime` vs step executors](#platform-jobs-runtime-vs-step-executors)): + +- [x] Add field to `JobsServiceConfig` (`Platform/services/core/jobs/src/nmp/core/jobs/config.py`). +- [x] Gate `SubprocessJobExecutionProfile` in `get_default_executor_profiles_for_runtime()` (K8s default `false`, docker local default `true`). +- [ ] Document in `Platform/services/core/jobs/README.md`. +- [ ] Expose in `packages/nmp_platform/config/local.yaml` and `nmp_platform_runner` local config. +- [ ] `GET /v2/execution-profiles` reflects the flag. + +--- + +### Step 1 — `nemo-customizer` (blocks Automodel HTTP) + +**Problem:** `discover_services()` maps each `nemo.services` key to one `/apis/{name}/` mount — only one owner for `customization`.
Training backends (Automodel, RL, Megatron, Unsloth) must share one URL tree without a monolithic `nmp-customizer` or per-backend top-level services. + +**Solution:** New package `plugins/nemo-customizer/` (`nemo_customizer`) ships the sole `nemo.services` → `customization` registration. Backends register as **contributors** via `nemo.customization.contributors`. Full design: [Customization router](#customization-router-in-scope--v1). + +**Router behavior (implement in this step):** + +1. `discover_customization_contributors()` — fault-isolated; allowlist `NEMO_PLUGIN_CUSTOMIZATION_CONTRIBUTORS_ALLOWLIST` (or `NEMO_PLUGIN_ALLOWLIST`). +2. **Zero contributors** → fail startup with clear error. +3. `CustomizationRouterService.get_routers()` — merge `RouterSpec` lists; **`dependencies`** = union of contributor + platform deps. +4. `CustomizationCLI.get_cli()` — `typer.Typer(name="customization")` + mount contributor subgroups (`automodel`, …). +5. OpenAPI / SDK — single service name `customization` when router + ≥1 contributor enabled. +6. **Route collision guard** — distinct segment per contributor under `.../workspaces/{workspace}/`; legacy `.../jobs` unmounted in v1. + +**`nemo-customizer-plugin` pyproject.toml:** + +```toml +[project.entry-points."nemo.services"] +customization = "nemo_customizer.router:CustomizationRouterService" + +[project.entry-points."nemo.cli"] +customization = "nemo_customizer.cli:CustomizationCLI" +``` + +**Deliverables** + +- [x] `CustomizationContributor` protocol in `nemo_platform_plugin/customization_contributor.py`; `discover_customization_contributors()` in `nemo_platform_plugin/discovery.py` (fault-isolated via `discover_entry_points`; allowlist `NEMO_PLUGIN_CUSTOMIZATION_CONTRIBUTORS_ALLOWLIST` or `NEMO_PLUGIN_ALLOWLIST`). `nemo_customizer/discovery.py` re-exports for backward compatibility. +- [x] **Zero contributors:** fail router startup with a clear error (do not mount an empty `/apis/customization` tree silently). 
+- [x] `CustomizationRouterService` + `CustomizationCLI` (merge contributors); **`dependencies`** = union of all contributor `dependencies` plus platform deps (`entities`, `jobs`, `auth`, …). +- [x] Entry points: `nemo.services` + `nemo.cli` → key `customization`. +- [x] Unit tests: two fake contributors → merged routes; prefix collision detection; zero contributors → startup error. +- [x] `OPENAPI_SERVICES` / registry: include `customization` when router plugin enabled **and** ≥1 contributor discovered. +- [x] `docs/CUSTOMIZATION.md` — contributor author guide (RL / Megatron / Unsloth). +- [x] Workspace members + `enabled-plugins` — [Step 0 workspace registration](#step-0--design-lock--platform-prerequisites). + +**Out of scope:** Legacy `POST .../workspaces/{ws}/jobs` multi-backend path; Studio cutover. + +--- + +### Step 2 — `nmp-automodel` package core + +`Platform/services/automodel/` — Python package **`nmp-automodel`**: compilers, task entrypoints, Dockerfiles. **No HTTP server** (unlike legacy `customizer-server`). Reference port: `Platform/services/customizer/` (trim multi-backend paths only). + +**4-step `PlatformJobSpec` pipeline** (Automodel-only): + +1. `file_io` (CPU) — download model + datasets (`nmp/automodel-tasks` image). +2. Training (GPU) — `finetune.py` + `nemo_automodel` recipes (SFT + KD); `nmp/automodel-training` image. +3. `file_io` upload. +4. `model_entity` — register model in Models service (behavior unchanged from legacy). 
+ +| Area | Source | Action | +|------|--------|--------| +| Automodel config compiler | `tasks/training/backends/automodel/config.py` | Move; drop non-automodel imports; SFT + `_configure_kd()` | +| Training runner/backend | `backend.py`, `finetune.py`, `callbacks.py`, `checkpoints.py` | Move; keep `JobsServiceProgressReporter` + `TrainingProgressCallback` (rank-0) | +| Training step compiler | `app/jobs/training/compiler.py` | **Strip** to automodel-only; fixed `nmp/automodel-training` image ref | +| Job compiler | `app/jobs/compiler.py` | **Strip** DPO/RL/`nemo_rl`/`megatron_bridge`; keep distillation (KD); 4-step only | +| File I/O tasks | `tasks/file_io/` | Ported: `run.py`, `callbacks.py`, `utils.py`, `progress_reporter.py` | +| Model entity task | `tasks/model_entity/` | Move unchanged behavior | +| Schemas | `api/v2/jobs/schemas.py` | `AutomodelJobInput` + `AutomodelJobOutput` (+ sub-models) for plugin `to_spec` / compiler | + +**Deliverables** + +- [x] `nmp-automodel` installable; task entry points via console scripts + `nemo-platform run task --task nmp.automodel.tasks.*`. +- [x] Unit tests: adapter + compiler (`services/automodel/tests/`); contract `generate_configs.py` imports `nmp.automodel`. +- [x] Prove `PlatformJobSpec` generation for SFT (4-step pipeline, `nmp.automodel.tasks.*` commands, `nmp/automodel-training` / `nmp/automodel-tasks` images). +- [x] `validate_for_training()` on legacy `CustomizationJobOutput` (compiler); plugin `AutomodelJobOutput` has parallel validator in `nemo_automodel_plugin.schema`. +- [x] `platform_client.py` — `fetch_model_entity`, `check_dataset_access`. +- [x] `_resolve_v4_compatible()` in training compiler. +- [x] Task modules `nmp.automodel.tasks.{file_io,training,model_entity}`; compiled steps use `nmp.automodel.tasks.*`. +- [x] `AutomodelConfig.default_training_execution_profile` (`NMP_AUTOMODEL_*`); adapter + compile wrapper apply request `profile`. 
+ +**Internal Jobs callback path** (not a new public route — same contract as legacy customizer): + +- [x] `NMPJobContext` env vars for job id, step, workspace, task name. +- [x] `JobsServiceProgressReporter` / `TrainingProgressCallback` → `sdk.jobs.tasks.create_or_update` (rank-0 only). +- [ ] Document as internal; exclude from public OpenAPI if auxiliary routes are added. + +*Optional later:* webhook-style callbacks — out of initial scope. + +--- + +### Step 3 — Container images + +Two runtime images (`nmp/automodel-tasks`, `nmp/automodel-training`) built from `nmp/automodel-base` (PyTorch + `nemo_automodel` deps), published under `nvcr.io/0921617854601259/nemo-platform-dev/nmp/...` — not the upstream NGC `nvcr.io/nvidia/nemo-automodel` training container name and not full `nmp-customizer` / RL / Megatron stack. Do **not** reuse or extend `customizer-automodel` during transition. + +| Image key | Dockerfile | Used by | Contents | +|-----------|------------|---------|----------| +| `nmp/automodel-training` | `Dockerfile.nmp-automodel-training` | GPU training step | `nmp/automodel-base` + `nmp-automodel` finetune backend (SFT + KD recipes) | +| `nmp/automodel-tasks` | `Dockerfile.nmp-automodel-tasks` | CPU steps (`file_io`, `model_entity`) | Slim glue; task entrypoints without customizer API server / RL / Megatron | + +**Deliverables** + +- [ ] Wire both keys in plugin `get_qualified_image()` / `NMP_AUTOMODEL_*` env overrides. +- [ ] CI: smoke import on **training** image (pattern: `Platform/tests/smoke_gpu/test_customizer_automodel.py`); lighter smoke on **tasks** image. +- [ ] Plugin README: size/dependency audit vs `customizer-automodel`. +- [ ] Helm/assets: image refs (Studio cutover to new URLs still out of scope). + +--- + +### Step 4 — `nemo-automodel` plugin (contributor + job) + +Plugin HTTP only — merged by router at `/apis/customization/.../automodel/...`. Requires **step 1** (`nemo-customizer-plugin`) in workspace. 
**`compile()`** depends on **step 2** (`nmp-automodel`). + +**Automodel plugin `pyproject.toml` (contributor — not `nemo.services`):** + +```toml +[project.entry-points."nemo.customization.contributors"] +automodel = "nemo_automodel_plugin.contributor:AutomodelContributor" + +[project.entry-points."nemo.jobs"] +"customization.automodel.jobs" = "nemo_automodel_plugin.jobs.jobs:AutomodelJob" +``` + +**Deliverables** + +- [x] **pyproject.toml:** `nemo-platform-plugin`, `nmp-automodel` (no `nemo-customizer-plugin` wheel dep — router installed via `enabled-plugins` only). Entry points: + - `nemo.customization.contributors` → `AutomodelContributor` (`automodel`) + - `nemo.jobs` → `customization.automodel.jobs` → `AutomodelJob` + - optional `nemo.docs` (no `nemo.services` / no top-level `nemo.cli`) +- [x] **`AutomodelContributor.get_routers()`** — optional `.../automodel/healthz`; mount jobs via `add_job_routes` (see wiring below); prefix `v2/workspaces/{workspace}/automodel`; `job_collection_path = "/automodel/jobs"`. +- [x] **`add_job_routes` wiring (required):** + - `service_name="customization"` — Jobs `source`, list filters, and OpenAPI service segment (default `_derive_service_name()` → `nemo-automodel-plugin` is **wrong**). + - `generate_job_name=generate_automodel_id` — `automodel-{uuid.hex[:12]}` when body omits `name` (same pattern as legacy `generate_customization_id`). + - `route_options=[JobRouteOption.CORE]` — create/list/get/delete/status/cancel/results; **no** PAUSE_RESUME in v1 (legacy parity). + - `default_profile` from plugin config when spec omits `training.execution_profile`. + - Request-body `profile` on `BaseJobRequest` — **deferred** (platform `add_job_routes` still drops it); v1 uses **`training.execution_profile`** in JSON only. 
+- [x] **`AutomodelJob`:** `description` set; `input_spec_schema` / `spec_schema` / `to_spec()` (Option A); `compile()` on `AutomodelJobOutput` only; `dependencies`: `entities`, `auth`, `jobs`, `secrets`, `files`, `models`. +- [ ] **Job envelope:** `description`, `project`, `ownership`, `custom_fields` — inherited from `job_route_factory` (no Automodel-specific fields); document in README. +- [x] **`get_cli()`** — `automodel` Typer subgroup via `add_job_commands` (`jobs` → `run` / `submit` / `explain` to `.../automodel/jobs`). Data Designer–style `cli/inputs.py` simplified JSON is [Step 5](#step-5--cli-submit-path), not required for Step 4. +- [x] SDK: `nemo-customizer-plugin` owns `nemo.sdk` → `customization`; composes `client.customization.automodel` from `nemo-automodel-plugin`. `nemo.docs` if user docs ship with plugin. +- [x] Workspace members + `enabled-plugins` — [Step 0 workspace registration](#step-0--design-lock--platform-prerequisites). + +**Docker enforcement & GPU validation** (`nemo_platform_plugin.jobs.docker` today: `validate_gpu_available_for_docker` only when `runtime == DOCKER` and reserved GPU list is empty — **extend for all Automodel jobs**): + +- [x] At **compile** (plugin `compile()` or shared helper): require `NemoPlatformConfig.runtime == DOCKER`. +- [x] Require `validate_docker_available()` (daemon reachable). +- [x] Require GPU pool configured (reuse or extend `validate_gpu_available_for_docker`). +- [x] `PlatformJobCompilationError` → 422, e.g. *“Automodel training requires `platform.runtime: docker` with GPU-backed container execution (Docker daemon reachable and GPUs configured).”* +- [x] Do **not** silently downgrade `platform.runtime` to `NONE` for this plugin. +- [ ] Do **not** conflate with `jobs.enable_subprocess_executor` — Automodel never schedules `subprocess` training steps. 
+ +--- + +### Step 5 — CLI submit path + +First-class CLI for simplified JSON jobs (pattern: Data Designer `[CONFIG_SOURCE]` → canonical spec in `plugins/nemo-data-designer/.../cli/inputs.py`). Commands hang under `nemo customization automodel` (router CLI + contributor subgroup). + +**Submit URL** (via `nemo_platform_plugin.commands` job submit helper): + +`/apis/customization/v2/workspaces/{workspace}/automodel/jobs` + +Custom wrappers must **forward** `--workspace` / `-w` to the framework callback (default `"default"` for local dev only). + +**Deliverables** + +- [x] `nemo customization automodel jobs submit job.json --workspace WS` — `cli/inputs.py` validates `AutomodelJobInput` and POSTs to `.../automodel/jobs` (`tests/test_cli.py`). +- [x] `jobs explain` — exposes `input_spec_schema` + `spec_schema` via framework `explain` (`tests/test_cli.py`). +- [x] CLI tests: `-w` / `--workspace` in submit URL (`submit_path_for` + mocked `submit_remote`). + +--- + +### Step 6 — Tests & contract continuity + +Relocate contract tests from legacy `customizer-automodel` path; validate router + contributor + compiler together. + +**Deliverables** + +- [x] Contract script import path fixed (`generate_configs.py` → `backends.config`); `services/automodel/tests/test_contract_configs.py` parses SFT/packing inputs + optional `--check` (embedding gated/skipped for v1). +- [x] Unit/API: Automodel + customization router routes under `/v2/workspaces/{workspace}/automodel/...` (`plugins/nemo-automodel/tests/test_api.py`). +- [x] Integration: compile-only via `services/automodel/tests/test_compiler.py` (contract fixture when present); CLI submit mocked in `test_cli.py`. +- [ ] Agentic smoke: adapt `Platform/tests/agentic-use/customizer-lora-job-cli` → `nemo customization automodel` CLI. +- [ ] E2E: job completes → **Model** entity exists → fileset populated → LoRA metadata when `finetuning_type=lora`.
+- [x] Workspace isolation: routes scoped by `{workspace}` path segment (`test_api.py`); full cross-workspace API test deferred to Jobs service integration. + +--- + +### Step 7 — SDK, OpenAPI, docs & rollout + +**API & SDK polish** + +- [ ] OpenAPI tags: “Automodel Training Jobs”. +- [ ] List/get/delete/results routes via `add_job_routes` defaults under `/v2/workspaces/{workspace}/automodel/jobs`. +- [x] SDK hub: `client.customization.automodel.jobs.create(workspace=..., spec=...)` — paths under `/v2/workspaces/{workspace}/automodel/jobs`; **no** silent global namespace default (document `workspace="default"` for local dev). +- [ ] Error mapping: `PlatformJobCompilationError` / `validate_for_training` → 422; `check_dataset_access` / model entity auth failures → 403 or 422 with clear copy. +- [ ] Migration guide field table: legacy flat `training` + single `dataset` string → `AutomodelJobInput` sections; `output_model` → `output`. + +**Docs & deploy** + +- [ ] Automodel plugin README: install, enabled-plugins, CLI examples, sample `job.json`. +- [ ] Config reference: `NMP_AUTOMODEL_*` (training/tasks image overrides, resource defaults); link `NMP_JOBS_ENABLE_SUBPROCESS_EXECUTOR` / [Step 0](#step-0--design-lock--platform-prerequisites). +- [ ] Migration guide: `CustomizationJob` / `CustomizationJobInput` → `AutomodelJobInput` field mapping. +- [ ] Helm/assets: deploy `nmp/automodel-training` + `nmp/automodel-tasks` (replace `customizer-automodel` on product cutover — Studio migration still out of scope). + +--- + +### Success criteria (exit checks) + +- [ ] `nemo customization automodel jobs submit job.json -w acme-corp` → `/apis/customization/v2/workspaces/acme-corp/automodel/jobs`; fails fast without Docker/GPU. +- [ ] `POST` accepts `AutomodelJobInput`; GET returns enriched `AutomodelJobOutput` in `acme-corp`. +- [ ] Completed job: **Model** entity + fileset + adapter metadata in same workspace. +- [ ] Training progress on Jobs task `status_details.metrics`. 
+- [ ] Training image CI smoke passes. +- [ ] No legacy `platform_job_config_compiler` / multi-backend customizer dependency. +- [ ] Router test: second fake contributor merges without router code changes. + +--- + +## Goals (from requirements) + +| Requirement | Intent | +|-------------|--------| +| **First-class CLI** | Submit/run jobs from a **simplified JSON** job config (not the full CustomizationJob API surface). Pattern: Data Designer’s `[CONFIG_SOURCE]` → canonical spec (`plugins/nemo-data-designer/.../cli/inputs.py`). | +| **Fail if Docker disabled for jobs** | Automodel training is GPU + container-only. Reject compile/submit when `platform.runtime` is not `docker` or Docker daemon/GPUs are unavailable (stricter than today’s “warn and set runtime NONE”). Independent of `jobs.enable_subprocess_executor`. | +| **First-class API** | Workspace-scoped REST under `/apis/customization/v2/workspaces/{workspace}/automodel/...` — `{workspace}` is a **required path segment** on every job route (create, list, get, delete, results). Served via the **customization router** (single `/apis/customization` mount); Automodel is the first contributor. | +| **Customization router** | **`nemo-customizer-plugin`** owns `/apis/customization` and merges HTTP/CLI/SDK from contributors (Automodel v1; RL / Megatron / Unsloth later) — no monolithic `nmp-customizer`, no per-backend top-level `/apis/*` services. | +| **Automodel-only job path** | No NeMo RL, Megatron-Bridge, DPO, GRPO, or multi-backend dispatch. Single compiler → single training step image. | +| **Internal callback API** | Keep task-level progress updates to the Jobs service (`sdk.jobs.tasks.create_or_update`) from training subprocesses — not a public user API. | +| **Simplified training image** | New image derived from `nemo-automodel` NGC base with only platform task glue + `nemo_automodel`, not full `nmp-customizer` / RL / Megatron stack. 
| +| **Entity lifecycle** | Jobs still: download artifacts → train → upload checkpoint → **create/update Model entity** (and LoRA adapter metadata where applicable). | +| **Jobs API parity** | `service_name="customization"` on `add_job_routes`; auto `automodel-{id}` names; `training.execution_profile` in spec; CORE routes only. | + +--- + +## Platform jobs: `runtime` vs step executors + +Two layers are easy to conflate; this plugin only cares about the second for **training steps**, but operators need both clear in config and docs. + +| Layer | Config | Cardinality | Meaning | +|-------|--------|-------------|---------| +| **Platform deployment** | `platform.runtime` | **One value** per process (`docker` \| `kubernetes` \| `none`) | How the platform orchestrates container workloads (Docker daemon vs K8s vs neither). **Not** “how every job step runs.” | +| **Job step execution** | `platform_spec.steps[].executor` | **Per step** | Backend for that step: `cpu`/`gpu` + container → Docker or K8s; `subprocess` → host process (local dev / lightweight tasks). | + +Today, when `platform.runtime: docker`, the Jobs service **implicitly** also registers `subprocess/default` (host execution) alongside `cpu/default` and `gpu/default` (Docker). That coupling is what makes `runtime: docker` sound like “everything runs in Docker.” + +### Proposed: `jobs.enable_subprocess_executor` + +Make host subprocess execution an **explicit** platform choice instead of a side effect of `runtime: docker`. + +| Field | Type | Default | Behavior | +|-------|------|---------|----------| +| `jobs.enable_subprocess_executor` | `bool` | `true` when `platform.runtime == docker` (local dev); **`false` on Kubernetes** unless explicitly set `true` | When `true`, register `subprocess/default` and allow steps with `provider: subprocess`. When `false`, omit subprocess from default profiles; CPU/GPU container steps use Docker (or K8s) only. 
Dev clusters may opt in explicitly; production K8s should leave host execution disabled. | + +**Implementation:** [Step 0 — Platform Jobs flag](#step-0--design-lock--platform-prerequisites) (cross-cutting, not Automodel-only). + +**Automodel plugin implications:** + +- Training steps are **always** `cpu`/`gpu` + container → Docker; Automodel does **not** depend on `enable_subprocess_executor`. +- Compile gate ([Step 4](#step-4--nemo-automodel-plugin-contributor--job)): **`platform.runtime == docker`** + daemon + GPUs — not “subprocess enabled.” +- Optional `jobs run` ([Step 4](#step-4--nemo-automodel-plugin-contributor--job) 2b): subprocess only if the flag is enabled. +- Prefer error copy: *“Automodel training requires `platform.runtime: docker` with GPU-backed container execution”* — avoid *“Docker job runtime”* without qualification. + +**Example local config (explicit):** + +```yaml +platform: + runtime: docker + +jobs: + enable_subprocess_executor: true # host steps for dev; training still uses cpu/gpu + container + executor_defaults: + docker: + launcher_tool_path: ./services/core/jobs/jobs-launcher/jobs-launcher + subprocess: + working_directory: /tmp/nmp-subprocess-jobs +``` + +Production / GPU-only deployments can set `enable_subprocess_executor: false` to avoid registering host execution while keeping `runtime: docker` for Automodel and other container jobs. + +--- + +## Current state (reference) + +### Legacy Customizer (`Platform/services/customizer/`) + +- **API**: `CustomizationJobInput` / `CustomizationJobOutput` via `job_route_factory` (`api/v2/jobs/endpoints.py`). +- **Compiler**: `platform_job_config_compiler` builds a **4-step** `PlatformJobSpec`: + 1. `nmp.customizer.tasks.file_io` (CPU) — download model + datasets + 2. Training (GPU) — backend selected in training compiler (`automodel` \| `nemo_rl` \| `megatron_bridge`) + 3. `file_io` upload + 4. 
`nmp.customizer.tasks.model_entity` — register model in Models service +- **Automodel backend**: `tasks/training/backends/automodel/` — `compile_automodel_config()`, `AutomodelBackend`, `finetune.py` (wraps `nemo_automodel` recipes + `TrainingProgressCallback`). +- **Image**: `customizer-automodel` (see `nmp/docker/Dockerfile.nmp-customizer`); contract tests in `Platform/tests/customizer-automodel-contract/`. +- **Progress “callbacks”**: `JobsServiceProgressReporter` + `TrainingProgressCallback` call Jobs internal task API (rank-0 only). + +### Platform plugin patterns (`Platform/plugins/`) + +- Entry points: training plugins use `nemo.customization.contributors`; **`nemo-customizer-plugin`** uses `nemo.services` + `nemo.cli` key `customization`; jobs via `nemo.jobs` (`customization.{contributor}.{job}`, e.g. `customization.automodel.jobs`). +- Jobs: `NemoJob` + `add_job_routes()` (`nemo_platform_plugin.jobs.routes`). +- Reference plugins: `nemo-evaluator` (service + job scaffold), `nemo-data-designer` (CLI config file → spec), `nemo-agents` (service + multiple routers). + +### Simplified config shape (already validated) + +Contract input JSONs under `Platform/tests/customizer-automodel-contract/input_configs/` are a good starting point for the **CLI/API simplified spec** (e.g. `llama_3_2_1b_lora.json`): `model`, `dataset`, `training`, `schedule`, `batch`, `optimizer`, `parallelism`, `output_model`, optional `seed`.
+ +--- + +## Target architecture + +```mermaid +flowchart TB + subgraph surfaces [Plugin surfaces] + CLI["nemo customization automodel jobs submit -w WS job.json"] + API["POST .../v2/workspaces/WS/automodel/jobs"] + SDK["client...jobs.create(workspace=WS)"] + end + + subgraph router [nemo-customizer] + CUST["CustomizationRouterService"] + MERGE["merge contributors"] + end + + subgraph plugin [plugins/nemo-automodel] + CONTrib["AutomodelContributor"] + JOB["AutomodelJob\n(NemoJob.compile)"] + CLI_MOD["automodel CLI subgroup"] + end + + subgraph pkg [services/automodel — library only, no HTTP server] + CORE["compile_spec / validate"] + TASK_TRAIN["tasks/training\n(automodel only)"] + TASK_IO["tasks/file_io"] + TASK_ME["tasks/model_entity"] + end + + subgraph deploy [platform.runtime docker] + JOBS["Jobs service"] + DOCKER["cpu/gpu steps → Docker"] + MODELS["Models service"] + FILES["Files service"] + end + + CLI --> CUST + API --> CUST + SDK --> CUST + CUST --> MERGE --> CONTrib + CONTrib --> JOB + CONTrib --> CLI_MOD + CONTrib --> CORE + JOB -->|compile PlatformJobSpec| JOBS + JOBS --> DOCKER + DOCKER --> TASK_IO + DOCKER --> TASK_TRAIN + DOCKER --> TASK_ME + TASK_TRAIN -->|internal tasks API| JOBS + TASK_ME --> MODELS + TASK_IO --> FILES +``` + +### Package layout (proposed) + +``` +Platform/ + plugins/nemo-customizer/ # router + contributor protocol (v1) + pyproject.toml + src/nemo_customizer/ + router.py # CustomizationRouterService (nemo.services → customization) + cli.py # CustomizationCLI + contributor.py # re-export CustomizationContributor from nemo_platform_plugin + discovery.py # re-export discover_customization_contributors + docs/CUSTOMIZATION.md # contributor author guide + + plugins/nemo-automodel/ + SCOPE.md # this file + pyproject.toml + src/nemo_automodel_plugin/ + contributor.py # AutomodelContributor (routers + CLI subgroup) + cli.py + cli/inputs.py # JSON config → spec + config.py # NemoConfig (image names, defaults) + schema.py # 
AutomodelJobInput, AutomodelJobOutput, sub-models + jobs/ + jobs.py # AutomodelJob (compile + optional local run) + sdk/ # optional hub resources + docs/ + + services/automodel/ # Python package nmp-automodel — tasks/compiler only (no HTTP server) + pyproject.toml + src/nmp/automodel/ + config.py + platform_client.py # model entity fetch (from customizer) + app/jobs/ + compiler.py # Automodel-only PlatformJobSpec (4 steps, slim) + training/ + compiler.py # single GPU step + schemas.py + file_io/ # port or thin wrapper from customizer + model_entity/ # port from customizer + tasks/ + training/backends/automodel/ # port: config, backend, finetune, callbacks + file_io/ + model_entity/ + docker/ + Dockerfile.nmp-automodel-training # GPU: nmp-automodel-base + finetune + Dockerfile.nmp-automodel-tasks # CPU: file_io / model_entity glue (slimmer) + tests/ + +``` + +**Dependency rule**: + +| Package | Depends on | Provides | +|---------|------------|----------| +| **`nemo-customizer-plugin`** | `nemo-platform-plugin` | Router service/CLI; `CustomizationContributor` protocol and `discover_customization_contributors()` live in **`nemo_platform_plugin`** | +| **`nemo-automodel`** (plugin) | `nemo-platform-plugin`, `nmp-automodel` (+ `nemo-customizer-plugin` at runtime via `enabled-plugins`) | `AutomodelContributor`, schemas; Step 5 `cli/inputs.py` optional | +| **`nmp-automodel`** (service) | `nmp-common`, platform SDK types | Compilers, task entrypoints, Dockerfiles | + +Avoid pulling entire legacy `nmp-customizer`. **`nemo-platform-plugin`** holds the contributor protocol and discovery (IGW-aligned); **`nemo-customizer-plugin`** holds only the router service/CLI merge logic. + +### Customization router (in scope — v1) + +**Problem:** `discover_services()` maps `nemo.services` entry-point **keys** 1:1 to mounted apps (`/apis/{name}/...`). Only one plugin can own `customization`.
A monolithic customizer is out; multiple training backends (Automodel, RL, Megatron, Unsloth) must share one URL tree without boxing future plugins into Automodel’s package. + +**Solution:** **`nemo-customizer-plugin`** ships **`CustomizationRouterService`** as the sole `nemo.services` registration for `customization`. Training plugins register as **contributors** via a new entry-point group; they do **not** register their own top-level `nemo.services` key. + +| Piece | Owner | Registration | +|-------|--------|----------------| +| `/apis/customization/...` mount | `nemo-customizer-plugin` | `nemo.services` → `customization` = `CustomizationRouterService` | +| Automodel routes | `nemo-automodel` plugin | `nemo.customization.contributors` → `automodel` = `AutomodelContributor` | +| Future RL / Megatron / Unsloth | Each backend’s plugin | Same group, distinct keys: `rl`, `megatron`, `unsloth`, … | +| Task/compiler library | `nmp-automodel` | No HTTP; imported by plugin + Jobs task images | + +**Contributor contract** (protocol defined in `nemo_platform_plugin`; re-exported from `nemo_customizer.contributor`): + +```python +class CustomizationContributor(Protocol): + """One training backend under /apis/customization.""" + + name: ClassVar[str] # must match entry-point key, e.g. "automodel" + + def get_routers(self) -> list[RouterSpec]: + """e.g. prefix v2/workspaces/{workspace}/automodel + job routes.""" + + def get_cli(self) -> typer.Typer | None: + """Subgroup mounted at `nemo customization {name}`.""" +``` + +SDK: **`nemo-customizer-plugin`** registers `nemo.sdk` → `customization` and composes per-contributor SDK modules (e.g. `nemo_automodel_plugin.sdk.resources` → `client.customization.automodel`). + + +**Router behavior:** + +1. `discover_customization_contributors()` loads all `nemo.customization.contributors` entry points (fault-isolated, allowlist via `NEMO_PLUGIN_CUSTOMIZATION_CONTRIBUTORS_ALLOWLIST` or `NEMO_PLUGIN_ALLOWLIST`). +2.
If **zero** contributors load, **fail startup** with a clear configuration error (router enabled but no backends). +3. `CustomizationRouterService.get_routers()` concatenates each contributor’s `RouterSpec` list (stable sort by `name`); `dependencies` = union of contributor + platform service deps (`merge_router_dependencies()` at router startup). +4. `CustomizationCLI.get_cli()` builds `typer.Typer(name="customization")` and mounts each contributor subgroup (`automodel`, …). +5. OpenAPI / SDK generation includes the merged tree under service name `customization` only. +6. **No route collision:** each contributor owns a distinct path segment after `.../workspaces/{workspace}/` (Automodel → `automodel`; legacy multi-backend `jobs` stays unmounted until a contributor revives it intentionally). + +**Automodel plugin wiring (v1):** + +| Surface | Entry point | Notes | +|---------|-------------|--------| +| HTTP | `nemo.customization.contributors.automodel` | **Not** `nemo.services` — router owns the mount | +| Jobs | `nemo.jobs` → `customization.automodel.jobs` | Unchanged | +| CLI | Via contributor `get_cli()` | `nemo customization automodel jobs ...` | +| SDK | `nemo-customizer-plugin` → `nemo.sdk:customization` composes contributor SDKs | `client.customization.automodel.jobs` | +| Tasks | `nmp-automodel` package | No server | + +**pyproject.toml (Automodel plugin):** + +```toml +[project.entry-points."nemo.customization.contributors"] +automodel = "nemo_automodel_plugin.contributor:AutomodelContributor" + +[project.entry-points."nemo.jobs"] +"customization.automodel.jobs" = "nemo_automodel_plugin.jobs.jobs:AutomodelJob" +``` + +**`nemo-customizer-plugin` pyproject.toml:** + +```toml +[project.entry-points."nemo.services"] +customization = "nemo_customizer.router:CustomizationRouterService" + +[project.entry-points."nemo.cli"] +customization = "nemo_customizer.cli:CustomizationCLI" + +[project.entry-points."nemo.sdk"] +customization = 
"nemo_customizer.sdk.resources:customization_sdk_resources" +``` + +Enable in platform workspace / `enabled-plugins` alongside `nemo-automodel`. + +**Implementation checklist:** [Step 1 — `nemo-customizer-plugin`](#step-1--nemo-customizer-blocks-automodel-http). + +### URL routing (decided) + +Platform mounts the router at `/apis/customization//...`. Automodel contributor prefix: + +| Piece | Value | +|-------|--------| +| Router `NemoService.name` | `customization` | +| Contributor key | `automodel` | +| Automodel `RouterSpec.prefix` | `v2/workspaces/{workspace}/automodel` | +| Example job create | `POST /apis/customization/v2/workspaces/{workspace}/automodel/jobs` | +| Legacy (deprecated) | `POST /apis/customization/v2/workspaces/{workspace}/jobs` — **not registered** in v1 | + +**No `/train/` segment:** Flat `/jobs` under `.../automodel/` (`NemoJob.job_collection_path = "/jobs"`). + +| Job wiring | Value | +|----------|--------| +| `NemoJob.job_collection_path` | `"/jobs"` | +| `NemoJob.name` | `"jobs"` (CLI/SDK subgroup suffix only) | +| `nemo.jobs` entry key | `customization.automodel.jobs` | +| `add_job_routes(..., service_name=)` | **`"customization"`** (required; sets Jobs `source` + filters) | +| `generate_job_name` | **`generate_automodel_id`** → `automodel-{hex}` | +| `route_options` | **`[JobRouteOption.CORE]`** (no pause/resume v1) | +| `training.execution_profile` | Spec field → GPU step profile; default from `NMP_AUTOMODEL_DEFAULT_TRAINING_EXECUTION_PROFILE` | +| Request `profile` body | **Deferred** — use spec field until `BaseJobRequest` plumbing lands | + +Do **not** register `nemo.services` = `automodel` (would split the product URL tree). 
+ +**Contributor job mount (reference):** + +```python +from nmp.common.jobs.api_factory import JobRouteOption +from nemo_platform_plugin.jobs.routes import add_job_routes + +def get_routers(self) -> list[RouterSpec]: + return [ + RouterSpec( + prefix="v2/workspaces/{workspace}/automodel", + router=add_job_routes( + AutomodelJob, + service_name="customization", + generate_job_name=generate_automodel_id, + route_options=[JobRouteOption.CORE], + default_profile=plugin_config.default_training_execution_profile, + ), + ), + ] +``` + +**CLI:** `nemo customization automodel jobs submit job.json` — router CLI + Automodel contributor subgroup. + +### Workspace scoping (required) + +All Automodel resources are scoped to a **platform workspace** (tenant/project boundary). The workspace is carried on the URL path for HTTP, on CLI/SDK calls for clients, and in job/task runtime env — it is **not** a separate top-level field in the simplified job JSON body. + +#### API routes (full pattern) + +Mount prefix: `v2/workspaces/{workspace}/automodel` → base: + +`/apis/customization/v2/workspaces/{workspace}/automodel` + +| Operation | Method | Path (after base) | +|-----------|--------|-------------------| +| Create job | `POST` | `/jobs` | +| List jobs | `GET` | `/jobs` | +| Get job | `GET` | `/jobs/{job_name}` | +| Delete job | `DELETE` | `/jobs/{job_name}` | +| Job results | `GET` | `/jobs/{job_name}/results/...` | + +Example: + +```http +POST /apis/customization/v2/workspaces/acme-corp/automodel/jobs +Content-Type: application/json + +{ "model": "llama-3-8b-base", "dataset": { ... }, ... } +``` + +`acme-corp` is the scope for: authz checks, Jobs service record, Models/Filesets entities, and compiled fileset `workspace` fields. + +#### Workspace in the job spec (body vs path) + +| Source | Role | +|--------|------| +| **Path `{workspace}`** | Authoritative scope for the job and all entities created by it (output model, output fileset, Jobs record). 
| +| **Spec `model`** | Model entity **name** in the path workspace, or qualified `other-workspace/model-name` for cross-workspace reads (same as legacy `CustomizationJobInput.model`). | +| **Spec `dataset`** | `{ training: "name" }` or `{ training: "workspace/name" }` — bare names resolve in the path workspace (no `fileset://` prefix). | +| **Spec `output.name`** | New or updated `ModelEntity` **in the path workspace** only. | +| **Body `workspace` field** | **Do not add** — avoids conflicting with the path param. | + +`compile(workspace, spec, ...)` and `to_spec(..., workspace=...)` receive the path workspace from `add_job_routes` / `job_route_factory` (same contract as `nemo_platform_plugin.jobs.routes`). + +#### CLI + +Auto-generated `submit` / `run` include `--workspace` / `-w` (default `"default"`). Custom wrappers must **forward** it to the framework callback: + +```bash +nemo customization automodel jobs submit job.json --workspace acme-corp +nemo customization automodel jobs submit job.json -w acme-corp +# execution profile: set training.execution_profile in job.json (request --profile body deferred) +``` + +Submit URL (see `nemo_platform_plugin.commands` job submit helper): + +`/apis/customization/v2/workspaces/{workspace}/automodel/jobs` + +i.e. `/apis/{NemoService.name}/{RouterSpec.prefix}/...` with `name=customization` and prefix `v2/workspaces/{workspace}/automodel`. + +#### SDK + +Hub resources take `workspace` on every call (pattern: evaluator `client.evaluator...`): + +```python +job = client.customization.automodel.jobs.create( + workspace="acme-corp", + spec=AutomodelJobInput(...), +) +status = client.customization.automodel.jobs.retrieve( + workspace="acme-corp", + name=job.name, +) +``` + +SDK must not default silently to a global namespace; document `workspace="default"` for local dev only. 
+ +#### Runtime (compiled job + tasks) + +| Stage | Workspace usage | +|-------|------------------| +| **Compile** | `fetch_model_entity(spec.model, workspace, sdk)`; output fileset refs use `workspace=None` in compile JSON and are resolved at runtime to the job workspace (legacy customizer pattern). | +| **Jobs service** | Job created in path workspace. | +| **Task containers** | `NEMO_JOB_WORKSPACE` (and `JobContext.workspace` / `get_workspace()`) set from job; `model_entity` task creates entities in that workspace. | +| **Progress callbacks** | `sdk.jobs.tasks.create_or_update(..., workspace=job_ctx.workspace, job=job_ctx.job_id, ...)`. | +| **List/filter** | API list endpoints return only jobs in the path workspace. | + +#### Tests + +→ [Step 5](#step-5--cli-submit-path) (CLI `-w`), [Step 6](#step-6--tests--contract-continuity) (API, integration, workspace isolation). + +--- + +## Work breakdown + +Phases map to [Implementation order](#implementation-order). **Checklists and step-level detail live in the steps above**; sections below add design reference (Option A wiring, Studio verification, JSON spec) without duplicating deliverables. 
+ +| Phase | Implementation step(s) | Topic | +|-------|------------------------|--------| +| 0 | [Step 0](#step-0--design-lock--platform-prerequisites) | Design lock, Jobs flag, schemas | +| 1 | [Step 1](#step-1--nemo-customizer-blocks-automodel-http) | Customization router | +| 2 | [Step 2](#step-2--nmp-automodel-package-core) | `nmp-automodel` compiler/tasks | +| 3 | [Step 3](#step-3--container-images) | Docker images | +| 4 | [Step 4](#step-4--nemo-automodel-plugin-contributor--job) | Automodel plugin + Docker gate | +| 5 | [Step 5](#step-5--cli-submit-path) | CLI | +| 6 | [Step 6](#step-6--tests--contract-continuity) | Tests | +| 7 | [Step 7](#step-7--sdk-openapi-docs--rollout) | SDK / docs / deploy | +| — | [Step 2](#step-2--nmp-automodel-package-core) (callbacks) | Internal Jobs task API (not public) | + +### Phase 0 — Design lock + +→ [Step 0](#step-0--design-lock--platform-prerequisites). Router design: [Customization router](#customization-router-in-scope--v1). + +#### Input vs canonical spec — **decided: Option A** + +On job **create**, the platform always: + +1. Validates the POST body against **`AutomodelJobInput`** (`input_spec_schema`). +2. Runs **`AutomodelJob.to_spec()`** → **`AutomodelJobOutput`** stored on the Jobs record (`spec_schema`). +3. Runs **`compile()`** on the canonical output → `platform_spec` for execution. + +Enrichment (auto output name/fileset, adapter vs model type, dataset ACL, model entity fetch) happens in step 2 — the Jobs service persists that result, not a post-compile rewrite. Rejected alternatives: single-schema POST (manual output fields), enrich-only-in-`compile()` (broken persistence), renamed input fields (unnecessary vs legacy). + +**`AutomodelJob` wiring:** + +```python +class AutomodelJobInput(BaseModel): # POST body / CLI JSON + model: str # name or workspace/name + dataset: DatasetSpec # training + optional validation fileset URIs + training: TrainingSpec # includes training_type, execution_profile, ... 
+ output: OutputRequest | None = None # optional name only + # @model_validator: reject "output_model" key with legacy error message + +class AutomodelJobOutput(BaseModel): # stored spec + GET response shape + output: OutputResponse # required: name, fileset, type (model | adapter) + # ... enriched fields from input ... + + def validate_for_training(self) -> None: + # Port MoE / parallelism rules from CustomizationJobOutput + +class AutomodelJob(NemoJob): + name = "jobs" + job_collection_path = "/jobs" + input_spec_schema = AutomodelJobInput + spec_schema = AutomodelJobOutput + dependencies = ["entities", "auth", "jobs", "secrets", "files", "models"] + + @classmethod + async def to_spec(cls, input_spec, *, workspace, entity_client, async_sdk, is_local): + # Port transform_input_to_output + check_dataset_access per fileset + + @classmethod + async def compile(cls, *, workspace, spec: AutomodelJobOutput, ...): + spec.validate_for_training() + # nmp.automodel.app.jobs.compiler → PlatformJobSpec (4 steps) +``` + +**Implementation notes:** + +- Port source: `Platform/services/customizer/src/nmp/customizer/utils.py` (`transform_input_to_output`). +- `to_spec()` generates `output.fileset`, infers `output.type`, runs `fetch_model_entity` + `check_dataset_access`. +- `compile()` receives **`AutomodelJobOutput` only**; calls `validate_for_training()` before building `PlatformJobSpec`. +- Mount via `add_job_routes(..., service_name="customization", generate_job_name=generate_automodel_id)` — [URL routing](#url-routing-decided). +- **CLI JSON** = `AutomodelJobInput`. **`jobs explain`** exposes both schemas. 
+ +#### Deprecation — Platform spin-up and Studio (verified) + +**Platform `AVAILABLE_SERVICES`** (`packages/nmp_platform_runner/src/nmp/platform_runner/registry.py`) does **not** include `customization` / `customizer`: + +```18:33:packages/nmp_platform_runner/src/nmp/platform_runner/registry.py +AVAILABLE_SERVICES: dict[str, str] = { + "hello-world": "nmp.hello_world.main:service", + "studio": "nmp.studio.main:service", + ... + "inference-gateway": "nmp.core.inference_gateway.main:service", +} +``` + +`API_SERVICES` and `OPENAPI_SERVICES` likewise omit customization. Plugin services are merged at runtime via `discover_services()` (e.g. future `customization` from `nemo.services`), but the **legacy `nmp.customizer` microservice is not started** by default platform spin-up in this repo. + +**Note:** The older `nmp/` tree still lists `"customization": "nmp.customizer.main:service"` in its copy of the registry — do not treat that as Platform default behavior. + +**Studio today:** + +| Signal | Status | +|--------|--------| +| `VITE_FF_CUSTOMIZER_ENABLED` | Default **`false`** (`featureFlags.ts`) | +| Routes | Gated via `CUSTOMIZER_ENABLED` / `gateRoutes` — customization pages hidden when flag off | +| Live API | Vendored hooks target `/apis/customization/v2/.../jobs`; comment states service removed and UI must not call at runtime | +| Tests | MSW handlers in `mocks/handlers/customizer.ts`; `create-a-customization.spec.tsx` is **`describe.skip`** | + +```8:9:Platform/web/packages/sdk/vendored/customizer/api.ts +// Note: these hooks call /apis/customization/v2/... endpoints that won't exist while the customizer +// service is removed. The customizer UI is feature-flagged off, so they should never be invoked at runtime. +``` + +**First PR implication:** Safe to register **`CustomizationRouterService`** plus **`AutomodelContributor`** without legacy `nmp-customizer`. Studio/SDK migration **out of scope**. 
+ +### Phase 1 — `nmp-automodel` package core + +→ [Step 2](#step-2--nmp-automodel-package-core). Port table and deliverables are defined there. + +### Phase 2 — Plugin surfaces + +→ [Step 4](#step-4--nemo-automodel-plugin-contributor--job) + [Step 5](#step-5--cli-submit-path). Requires [Step 1](#step-1--nemo-customizer-blocks-automodel-http). + +### Phase 3 — Docker enforcement & GPU validation + +→ [Step 4](#step-4--nemo-automodel-plugin-contributor--job) (compile-time checks). Today `validate_gpu_available_for_docker` only runs when `runtime == DOCKER` and reserved GPU list is empty — extend for all Automodel jobs. + +### Phase 4 — Container images + +→ [Step 3](#step-3--container-images). + +### Phase 5 — Internal Jobs callback path + +→ [Step 2](#step-2--nmp-automodel-package-core) (not a new public route). Optional later: webhooks. + +### Phase 6 — API & SDK polish + +→ [Step 7](#step-7--sdk-openapi-docs--rollout). + +### Phase 7 — Testing & contract continuity + +→ [Step 6](#step-6--tests--contract-continuity). + +### Phase 8 — Docs & rollout + +→ [Step 7](#step-7--sdk-openapi-docs--rollout). + +--- + +## Simplified JSON spec (draft) — `AutomodelJobInput` only + +POST body and CLI JSON file use **`AutomodelJobInput`** only. After create, GET returns **`AutomodelJobOutput`** with enriched `output` (fileset, type). Validated in the context of the path **`workspace`** (or CLI `-w`). Entity names below are relative to that workspace unless qualified as `other-ws/name`. 
+ +```json +{ + "name": "optional-job-name", + "model": "llama-3-8b-base", + "dataset": { + "training": "my-sft-train", + "validation": "my-sft-val" + }, + "training": { + "training_type": "sft | distillation", + "finetuning_type": "lora | all_weights | lora_merged", + "lora": { "rank": 16, "alpha": 32, "merge": false, "target_modules": null }, + "max_seq_length": 2048, + "execution_profile": "gpu", + "teacher_model": "meta/llama-3.2-3b-instruct", + "distillation_ratio": 0.5, + "distillation_temperature": 1.0, + "teacher_precision": "bf16", + "offload_teacher": false + }, + "schedule": { "epochs": 1, "max_steps": 50, "val_check_interval": 25, "seed": 42 }, + "batch": { "global_batch_size": 8, "micro_batch_size": 1, "sequence_packing": false }, + "optimizer": { + "learning_rate": 5e-6, + "weight_decay": 0.01, + "warmup_steps": 0 + }, + "parallelism": { + "num_nodes": 1, + "num_gpus_per_node": 1, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "context_parallel_size": 1 + }, + "output": { "name": "my-finetuned-model", "description": "optional" }, + "integrations": { + "wandb": { "enabled": true, "project": "my-project", "api_key_secret": "wandb-api-key" }, + "mlflow": null + } +} +``` + +**Validation rules:** + +- **`output_model` is rejected** at parse time (legacy: *"spec.output_model was removed. Use spec.output instead."*). +- `teacher_model`, `distillation_*`, and `offload_teacher` only when `training_type` is `distillation` (omit for `sft`). +- Optional `dataset.prompt_template` for non-chat prompt/completion data (chat datasets use tokenizer chat template — document in README). +- Compiler may accept additional optimizer/parallelism fields required by contract JSONs even if omitted from this minimal example (`adam_beta1`, `expert_parallel_size`, …). 
+ +**Training types in v1:** + +| `training_type` | Automodel recipe | Notes | +|-----------------|------------------|-------| +| `sft` | `TrainFinetuneRecipeForNextTokenPrediction` | Default; LoRA / all_weights / lora_merged | +| `distillation` | `KnowledgeDistillationRecipeForNextTokenPrediction` | Requires `teacher_model`; maps to Automodel `teacher_model`, `kd_ratio`, `kd_loss_fn` ([`nemo_automodel/recipes/llm/kd.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/llm/kd.py), example [`examples/llm_kd/llama3_2/llama3_2_1b_kd.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_kd/llama3_2/llama3_2_1b_kd.yaml)) | + +**KD / distillation fields** (when `training_type: distillation`): mirror legacy Customizer API — `teacher_model` (entity ref in path workspace), `distillation_ratio` (→ `kd_ratio`, default `0.5`), `distillation_temperature` (→ `kd_loss_fn.temperature`, default `1.0`), `teacher_precision` (default `bf16`), optional `offload_teacher` (→ `offload_teacher_model`). Compiler port: `_configure_kd()` in legacy `automodel/config.py`. Validate tokenizer compatibility student/teacher before submit. + +**Explicitly out of scope (v1):** DPO, GRPO, `nemo_rl`, `megatron_bridge`, quantized LoRA, DoRA, **embedding-model SFT** (`embed_1b` / biencoder recipe), **`deployment_config`** (post-train NIM deploy), request-body **`profile`** on job create (use `training.execution_profile`). + +**Compiler responsibilities** (unchanged from legacy): + +1. Resolve `model` → `ModelEntity` in **path workspace** (or explicit `ws/name` ref). +2. Resolve dataset filesets in **path workspace** → local paths in download step. +3. `compile_automodel_config()` → YAML/JSON for `finetune.py`. +4. Generate output fileset + `ModelEntityTaskConfig` with `workspace` = path workspace (output model and fileset live in that workspace). 
+ +--- + +## Risk & complexity notes + +| Topic | Note | +|-------|------| +| **Largest port** | `compile_automodel_config()` (~800 LOC) and `validate_for_training()` (MoE/parallelism). `deployment_config` and embedding SFT are **out of scope v1**. | +| **Shared code** | File I/O and model_entity tasks are backend-agnostic — candidate for `nmp-common` or small `nmp-training-tasks` lib later; v1 can duplicate to ship faster. | +| **Python version** | NGC automodel uses 3.12; platform pins 3.11 for API — task image runs 3.12 (existing customizer pattern). | +| **KD / distillation** | In v1 JSON as `training_type: distillation`; compiler maps to Automodel KD recipe (see simplified JSON section). | +| **Customizer service** | Remains in repo but unused; avoid dual registration in `NMP_SERVICES`. | +| **Studio cutover** | **Out of scope** — no feature flag or Studio migration in Automodel v1; `VITE_FF_CUSTOMIZER_ENABLED` stays off. | +| **Customization router** | v1 in scope: **`nemo-customizer-plugin`** (`CustomizationRouterService` + contributor protocol); Automodel first contributor; RL/Megatron/Unsloth add contributors later without new `/apis/*` services. | +| **`runtime` vs subprocess flag** | `platform.runtime: docker` enables Docker-backed job profiles; `jobs.enable_subprocess_executor` separately controls host subprocess. Automodel training requires the former, not the latter. | + +--- + +## Non-critical follow-ups (post-v1) + +Merged into [Implementation order](#implementation-order) and [Decisions](#decisions-resolved). Remaining items are not blocking the first PR: + +| Topic | Notes | +|-------|--------| +| **`nemo.customization.contributors` in `_ALL_SURFACE_GROUPS`** | **Done:** `nemo_platform_plugin.discovery` — manifests + `discover_customization_contributors()` (IGW-aligned). 
| +| **Request-body `profile` on job create** | Platform follow-up MR on `BaseJobRequest` + `add_job_routes`; until then CLI `--profile` may only map to `training.execution_profile` in JSON. | +| **`custom_fields` passthrough** | Factory already supports it; document if customers rely on it. | +| **Full optimizer / MoE parallelism in public JSON** | Compiler + contracts may need fields beyond the minimal example; expand OpenAPI as contract port discovers gaps. | +| **Chat dataset contract tests** | Port `*_full_sft_chat.json` when `prompt_template` behavior is documented. | + +--- + +## Decisions (resolved) + +| # | Topic | Decision | +|---|--------|----------| +| 1 | **Service vs plugin-only** | **No standalone `nmp-automodel` HTTP server.** Automodel HTTP lives on **`AutomodelContributor`** merged by **`nemo-customizer-plugin`** (`CustomizationRouterService`) at `/apis/customization`. `nmp-automodel` is compiler + tasks only. | +| 2 | **KD / distillation** | **Include in v1** simplified JSON when `training_type: distillation`. Map to [Automodel KD recipe](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/llm/kd.py) (`teacher_model`, `kd_ratio`, `kd_loss_fn`, optional `offload_teacher_model`). Port legacy `_configure_kd()` / `DistillationConfig` from customizer automodel backend. | +| 3 | **Image naming** | **`nvcr.io/0921617854601259/nemo-platform-dev/nmp-automodel-training`** (GPU) and **`.../nmp-automodel-tasks`** (CPU) — a single repo segment after the registry prefix, because NVCR does not allow `nmp/automodel-*` nesting. Do **not** reuse `customizer-automodel` or the upstream `nvcr.io/nvidia/nemo-automodel` image name. | +| 4 | **Workspace package name** | **`nmp-automodel`** (PyPI upstream library remains `nemo-automodel` / NGC image name unchanged). | +| 5 | **Studio cutover** | **Punted** — no Studio feature flag or migration to `.../automodel/...` in this scope.
| +| 6 | **`customization` owner** | **In scope v1:** dedicated **`nemo-customizer-plugin`** owns `nemo.services` key `customization`; backends register via **`nemo.customization.contributors`** (Automodel first; RL / Megatron / Unsloth later). `nemo-automodel` must **not** register `nemo.services` directly. | +| 7 | **`enable_subprocess_executor` on K8s** | **Default `false` on Kubernetes**; explicit `true` only when dev clusters need host subprocess. Default `true` for `platform.runtime: docker` local dev. | +| 8 | **Jobs `source` / naming** | **`service_name="customization"`** on `add_job_routes` (never default `nemo-automodel-plugin`). Auto names: **`automodel-{hex}`** via `generate_automodel_id`. | +| 9 | **`execution_profile` v1** | In **`training.execution_profile`** on job spec; default from **`NMP_AUTOMODEL_DEFAULT_TRAINING_EXECUTION_PROFILE`**. Request-body `profile` on create — **deferred** (platform gap). | +| 10 | **Embedding SFT** | **Out of scope v1** (causal LM + KD only); `embed_1b` contracts gated in Step 6 until product expands. | +| 11 | **`deployment_config`** | **Out of scope v1** (post-train NIM deploy; Studio-adjacent). | +| 12 | **Router zero contributors** | **Fail startup** if customization plugin is enabled but no `nemo.customization.contributors` load. | + diff --git a/plugins/nemo-automodel/pyproject.toml b/plugins/nemo-automodel/pyproject.toml new file mode 100644 index 00000000..126816f9 --- /dev/null +++ b/plugins/nemo-automodel/pyproject.toml @@ -0,0 +1,51 @@ +[project] +name = "nemo-automodel-plugin" +version = "0.1.0" +description = "NeMo Automodel customization contributor for NeMo Platform." 
+readme = "README.md" +requires-python = ">=3.11,<3.14" +dependencies = [ + "nemo-platform-plugin", + "nemo-platform", + "nmp-automodel", + "pydantic>=2.10.6", + "pydantic-settings>=2.6.1", + "typer>=0.12.5", +] + +[project.entry-points."nemo.customization.contributors"] +automodel = "nemo_automodel_plugin.contributor:AutomodelContributor" + +[project.entry-points."nemo.jobs"] +"customization.automodel.jobs" = "nemo_automodel_plugin.jobs.jobs:AutomodelJob" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/nemo_automodel_plugin"] + +[tool.uv.sources] +nemo-platform-plugin = { workspace = true } +nemo-platform = { workspace = true } +nmp-automodel = { workspace = true } +nemo-customizer-plugin = { workspace = true } + +[dependency-groups] +dev = [ + "pytest>=8.3.4", + "pytest-asyncio>=0.25.3", + "ruff>=0.11.8", + "fastapi>=0.115.0", + "httpx>=0.27.0", + "nemo-customizer-plugin", +] + +[tool.pytest.ini_options] +asyncio_mode = "auto" +pythonpath = ["src"] +testpaths = ["tests"] + +[tool.pyright] +extraPaths = ["src"] diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/__init__.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/__init__.py new file mode 100644 index 00000000..7cca7f91 --- /dev/null +++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/__init__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""NeMo Automodel customization contributor.""" diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/__init__.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/__init__.py new file mode 100644 index 00000000..4b22fee8 --- /dev/null +++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/__init__.py @@ -0,0 +1,9 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0
+
+"""Automodel contributor CLI helpers."""
+
+from nemo_automodel_plugin.cli.inputs import apply_automodel_job_cli_overrides, load_job_json
+from nemo_automodel_plugin.cli.main import AutomodelContributorCLI
+
+__all__ = ["AutomodelContributorCLI", "apply_automodel_job_cli_overrides", "load_job_json"]
diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/inputs.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/inputs.py
new file mode 100644
index 00000000..1dcd6e7c
--- /dev/null
+++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/inputs.py
@@ -0,0 +1,112 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""CLI overrides: submit/run accept a job JSON file instead of ``--spec``."""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Callable
+from pathlib import Path
+
+import typer
+
+from nemo_automodel_plugin.schema import AutomodelJobInput
+
+_JOB_JSON_HELP = "Path to Automodel job JSON (AutomodelJobInput schema)."
+
+
+def load_job_json(path: Path) -> str:
+    """Load and validate job JSON; return canonical JSON string for ``--spec``.
+
+    The file is decoded as UTF-8 (the JSON interchange encoding) rather than the
+    platform default. Parse errors (``json.JSONDecodeError``) and any error raised
+    by ``AutomodelJobInput.model_validate`` propagate to the caller.
+    """
+    data = json.loads(path.read_text(encoding="utf-8"))
+    validated = AutomodelJobInput.model_validate(data)
+    return validated.model_dump_json()
+
+
+def apply_automodel_job_cli_overrides(group: typer.Typer) -> None:
+    """Flat ``automodel`` CLI: ``submit JOB.json``; ``run`` is disabled."""
+    _replace_job_run_disabled(group)
+    _replace_job_submit(group)
+
+
+def _pluck_callback(group: typer.Typer, verb: str) -> Callable[..., None]:
+    """Return the callback registered on ``group`` under ``verb``.
+
+    Raises ``RuntimeError`` when the command is absent or has no callback.
+    """
+    # A default of None keeps a missing command from surfacing as a bare StopIteration.
+    command = next((c for c in group.registered_commands if c.name == verb), None)
+    if command is None or command.callback is None:
+        raise RuntimeError(f"missing {verb!r} callback to override")
+    return command.callback
+
+
+def _drop_command(group: typer.Typer, name: str) -> None:
+    """Remove every command called ``name`` from ``group``."""
+    group.registered_commands = [c for c in group.registered_commands if c.name != name]
+
+
+def _replace_job_run_disabled(group: typer.Typer) -> None:
+    """Swap ``run`` for a stub that prints a hint on stderr and exits with code 1."""
+    _drop_command(group, "run")
+
+    @group.command("run")
+    def run(
+        _typer_ctx: typer.Context,
+        _job_json: Path | None = typer.Argument(
+            None,
+            metavar="JOB_JSON",
+            help=_JOB_JSON_HELP,
+        ),
+    ) -> None:
+        typer.secho(
+            "Automodel does not support local run. Submit to the platform API instead:\n"
+            "  nemo customization automodel submit JOB_JSON -w WORKSPACE",
+            err=True,
+            fg=typer.colors.RED,
+        )
+        raise typer.Exit(code=1)
+
+
+def _replace_job_submit(group: typer.Typer) -> None:
+    """Register a ``submit`` that takes a JOB_JSON path and forwards it as ``spec``."""
+    original = _pluck_callback(group, "submit")
+
+    @group.command("submit")
+    def submit(
+        typer_ctx: typer.Context,
+        job_json: Path = typer.Argument(..., metavar="JOB_JSON", help=_JOB_JSON_HELP),
+        workspace: str = typer.Option("default", "--workspace", "-w", help="Target workspace."),
+        profile: str | None = typer.Option(None, "--profile"),
+        cluster: str | None = typer.Option(None, "--cluster"),
+        base_url: str | None = typer.Option(
+            None,
+            "--base-url",
+            help=(
+                "Override platform API host. If omitted: --cluster, then CLI context, "
+                "then $NMP_BASE_URL, then http://localhost:8080."
+            ),
+        ),
+        options: list[str] = typer.Option([], "-o", help="Backend option override, 'backend.key=value'."),
+        options_file: Path | None = typer.Option(None, "--options-file"),
+    ) -> None:
+        spec_json = load_job_json(job_json)
+        # The wrapped callback is still called with its spec/spec_file/config keywords.
+        original(
+            typer_ctx,
+            spec=spec_json,
+            spec_file=None,
+            options=options,
+            options_file=options_file,
+            profile=profile,
+            cluster=cluster,
+            base_url=base_url,
+            workspace=workspace,
+            config=None,
+            config_file=None,
+        )
diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/main.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/main.py
new file mode 100644
index 00000000..e0d2072d
--- /dev/null
+++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/cli/main.py
@@ -0,0 +1,21 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""CLI hooks for the Automodel customization contributor."""
+
+from __future__ import annotations
+
+import typer
+from nemo_platform_plugin.job import NemoJob
+
+from nemo_automodel_plugin.cli.inputs import apply_automodel_job_cli_overrides
+from nemo_automodel_plugin.jobs.jobs import AutomodelJob
+
+
+class AutomodelContributorCLI:
+    """Passed to ``add_job_commands`` to override job submit/run with job-file args."""
+
+    def update_job_cli(self, job_cls: type[NemoJob], group: typer.Typer) -> None:
+        """Apply the job-file overrides to ``group`` only when ``job_cls`` is ``AutomodelJob``."""
+        if job_cls is AutomodelJob:
+            apply_automodel_job_cli_overrides(group)
diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/config.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/config.py
new file mode 100644
index 00000000..7ac7a09e
--- /dev/null
+++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/config.py
@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 + +"""Plugin configuration for Automodel training.""" + +from __future__ import annotations + +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class AutomodelPluginConfig(BaseSettings): + """Environment-driven Automodel plugin settings.""" + + model_config = SettingsConfigDict(env_prefix="NMP_AUTOMODEL_", extra="ignore") + + default_training_execution_profile: str = "gpu" + training_image: str = "nmp-automodel-training" + tasks_image: str = "nmp-automodel-tasks" + + +def get_config() -> AutomodelPluginConfig: + return AutomodelPluginConfig() + + +def generate_automodel_id() -> str: + """Generate a job name when the submitter omits ``name``.""" + import uuid + + return f"automodel-{uuid.uuid4().hex[:12]}" diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/contributor.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/contributor.py new file mode 100644 index 00000000..c8e70a9b --- /dev/null +++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/contributor.py @@ -0,0 +1,88 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Automodel customization contributor.""" + +from __future__ import annotations + +from typing import ClassVar + +import typer +from fastapi import APIRouter +from nemo_platform_plugin.authz import AuthzContribution, authz_for_workspace_job_collection +from nemo_platform_plugin.jobs.routes import add_job_routes +from nemo_platform_plugin.service import RouterSpec +from nmp.common.jobs.api_factory import JobRouteOption + +from nemo_automodel_plugin.config import generate_automodel_id, get_config +from nemo_automodel_plugin.jobs.jobs import AutomodelJob + + +class AutomodelContributor: + """Registers Automodel routes under the customization router.""" + + name: ClassVar[str] = "automodel" + dependencies: ClassVar[list[str]] = ["entities", "auth", "jobs", "secrets", "files", "models"] + + def get_routers(self) -> list[RouterSpec]: + config = get_config() + router = APIRouter() + + @router.get("/healthz") + async def healthz() -> dict[str, str]: + return {"backend": self.name, "status": "ok"} + + jobs_router = add_job_routes( + AutomodelJob, + service_name="customization", + generate_job_name=generate_automodel_id, + route_options=[JobRouteOption.CORE], + default_profile=config.default_training_execution_profile, + ) + + return [ + RouterSpec( + router=router, + prefix="/v2/workspaces/{workspace}/automodel", + tag="Automodel", + description="Automodel contributor health.", + ), + RouterSpec( + router=jobs_router, + prefix="/v2/workspaces/{workspace}", + tag="Automodel Jobs", + description="Automodel training jobs.", + ), + ] + + def get_cli(self) -> typer.Typer: + from nemo_platform_plugin.commands import ( + _add_explain_command, + _add_run_command, + _add_submit_command, + ) + from nemo_platform_plugin.scheduler import NemoJobScheduler + + from nemo_automodel_plugin.cli.inputs import apply_automodel_job_cli_overrides + + app = typer.Typer( + name=self.name, + help="Automodel training jobs (SFT, distillation).", + 
no_args_is_help=True, + ) + scheduler = NemoJobScheduler() + _add_run_command(app, AutomodelJob, scheduler) + _add_submit_command(app, AutomodelJob, scheduler) + _add_explain_command(app, AutomodelJob, scheduler) + apply_automodel_job_cli_overrides(app) + return app + + def get_authz_contribution(self) -> AuthzContribution: + """Register automodel job routes with the platform authorization policy.""" + return authz_for_workspace_job_collection( + api_area="customization", + collection_suffix="/automodel/jobs", + permission_prefix="customization.automodel.jobs", + include_healthz=True, + healthz_suffix="/automodel/healthz", + ) diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/jobs/__init__.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/jobs/__init__.py new file mode 100644 index 00000000..e5725ea5 --- /dev/null +++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/jobs/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/jobs/jobs.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/jobs/jobs.py new file mode 100644 index 00000000..e23e7c47 --- /dev/null +++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/jobs/jobs.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Automodel training job (NemoJob).""" + +from __future__ import annotations + +from typing import ClassVar, cast + +from nemo_platform import AsyncNeMoPlatform +from nemo_platform_plugin.config import NemoPlatformConfig, Runtime +from nemo_platform_plugin.job import NemoJob +from nemo_platform_plugin.jobs.api_factory import PlatformJobSpec +from nemo_platform_plugin.jobs.docker import validate_gpu_available_for_docker +from nemo_platform_plugin.jobs.exceptions import PlatformJobCompilationError +from nmp.automodel.compile import platform_job_config_compiler +from pydantic import BaseModel + +from nemo_automodel_plugin.config import get_config +from nemo_automodel_plugin.schema import AutomodelJobInput, AutomodelJobOutput +from nemo_automodel_plugin.transform import transform_input_to_output + + +def _require_docker_runtime() -> None: + platform_config = NemoPlatformConfig.get() + if platform_config.runtime != Runtime.DOCKER: + raise PlatformJobCompilationError( + "Automodel training requires platform.runtime: docker with GPU-backed container execution.", + ) + from nemo_platform_plugin.config import validate_docker_available + + if not validate_docker_available(): + raise PlatformJobCompilationError( + "Automodel training requires a reachable Docker daemon (platform.runtime: docker).", + ) + + +class AutomodelJob(NemoJob): + """GPU Automodel fine-tuning job under the customization router.""" + + name: ClassVar[str] = "automodel.jobs" + description: ClassVar[str] = "Automodel SFT and knowledge-distillation training jobs." 
+ job_collection_path: ClassVar[str | None] = "/automodel/jobs" + input_spec_schema: ClassVar[type[BaseModel] | None] = AutomodelJobInput + spec_schema: ClassVar[type[BaseModel] | None] = AutomodelJobOutput + dependencies: ClassVar[list[str]] = ["entities", "auth", "jobs", "secrets", "files", "models"] + + @classmethod + async def to_spec( + cls, + input_spec: BaseModel, + workspace: str, + entity_client: object, + async_sdk: object, + is_local: bool, + ) -> AutomodelJobOutput: + job_input = ( + input_spec + if isinstance(input_spec, AutomodelJobInput) + else AutomodelJobInput.model_validate(input_spec.model_dump()) + ) + return await transform_input_to_output(job_input, workspace, cast(AsyncNeMoPlatform, async_sdk)) + + @classmethod + async def compile( + cls, + workspace: str, + spec: BaseModel, + entity_client: object, + job_name: str | None, + async_sdk: object, + profile: str | None = None, + options: dict | None = None, + ) -> PlatformJobSpec: + _require_docker_runtime() + canonical = ( + spec if isinstance(spec, AutomodelJobOutput) else AutomodelJobOutput.model_validate(spec.model_dump()) + ) + canonical.validate_for_training() + + plugin_config = get_config() + execution_profile = ( + canonical.training.execution_profile or profile or plugin_config.default_training_execution_profile + ) + + platform_spec = await platform_job_config_compiler( + canonical, + workspace, + cast(AsyncNeMoPlatform, async_sdk), + job_name=job_name, + profile=execution_profile, + ) + + validate_gpu_available_for_docker(platform_spec) + return platform_spec diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py new file mode 100644 index 00000000..30a9ef58 --- /dev/null +++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py @@ -0,0 +1,240 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Automodel job input/output schemas (simplified JSON v1).""" + +from __future__ import annotations + +from typing import Any, Literal, Self + +from pydantic import BaseModel, ConfigDict, Field, ValidationError, model_validator + + +class LoRAParams(BaseModel): + model_config = ConfigDict(extra="forbid") + + rank: int = Field(default=16, gt=0) + alpha: int = Field(default=32, gt=0) + merge: bool = False + target_modules: list[str] | None = None + + +class DatasetSpec(BaseModel): + model_config = ConfigDict(extra="forbid") + + training: str = Field(description="Training fileset as 'name' or 'workspace/name'.") + validation: str | None = None + prompt_template: str | None = None + + +class TrainingSpec(BaseModel): + model_config = ConfigDict(extra="forbid") + + training_type: Literal["sft", "distillation"] = "sft" + finetuning_type: Literal["lora", "all_weights", "lora_merged"] = "lora" + lora: LoRAParams | None = None + max_seq_length: int = Field(default=2048, gt=0) + execution_profile: str | None = Field(default=None, min_length=1) + teacher_model: str | None = None + distillation_ratio: float = Field(default=0.5, ge=0.0, le=1.0) + distillation_temperature: float = Field(default=1.0, gt=0.0) + teacher_precision: Literal["bf16", "fp16", "fp32"] = "bf16" + offload_teacher: bool = False + + @model_validator(mode="after") + def _training_type_fields(self) -> Self: + if self.training_type == "distillation" and not self.teacher_model: + raise ValueError("teacher_model is required when training_type is distillation") + if self.finetuning_type.startswith("lora") and self.lora is None: + self.lora = LoRAParams() + return self + + +class ScheduleSpec(BaseModel): + model_config = ConfigDict(extra="forbid") + + epochs: int = Field(default=1, gt=0) + max_steps: int | None = Field(default=None, gt=0) + val_check_interval: float | None = None + seed: int | None = None + + +class BatchSpec(BaseModel): + model_config = 
ConfigDict(extra="forbid") + + global_batch_size: int = Field(default=8, gt=0) + micro_batch_size: int = Field(default=1, gt=0) + sequence_packing: bool = False + + +class OptimizerSpec(BaseModel): + model_config = ConfigDict(extra="forbid") + + learning_rate: float = Field(default=5e-6, gt=0.0) + weight_decay: float = Field(default=0.01, ge=0.0) + warmup_steps: int = Field(default=0, ge=0) + + +class ParallelismSpec(BaseModel): + model_config = ConfigDict(extra="forbid") + + num_nodes: int = Field(default=1, gt=0) + num_gpus_per_node: int = Field(default=1, gt=0) + tensor_parallel_size: int = Field(default=1, gt=0) + pipeline_parallel_size: int = Field(default=1, gt=0) + context_parallel_size: int = Field(default=1, gt=0) + expert_parallel_size: int | None = Field(default=None, gt=0) + + +class OutputRequest(BaseModel): + model_config = ConfigDict(extra="forbid") + + name: str + description: str | None = None + + +class OutputResponse(BaseModel): + model_config = ConfigDict(extra="forbid") + + name: str + type: Literal["model", "adapter"] + fileset: str + description: str | None = None + + +class WandbIntegration(BaseModel): + model_config = ConfigDict(extra="forbid") + + enabled: bool = True + project: str | None = None + api_key_secret: str | None = None + + +class IntegrationsSpec(BaseModel): + model_config = ConfigDict(extra="forbid") + + wandb: WandbIntegration | None = None + mlflow: dict[str, Any] | None = None + + +class AutomodelJobInput(BaseModel): + """POST body / CLI JSON.""" + + model_config = ConfigDict(extra="forbid") + + name: str | None = None + model: str + dataset: DatasetSpec + training: TrainingSpec + schedule: ScheduleSpec = Field(default_factory=ScheduleSpec) + batch: BatchSpec = Field(default_factory=BatchSpec) + optimizer: OptimizerSpec = Field(default_factory=OptimizerSpec) + parallelism: ParallelismSpec = Field(default_factory=ParallelismSpec) + output: OutputRequest | None = None + integrations: IntegrationsSpec | None = None + + 
@model_validator(mode="before") + @classmethod + def reject_legacy_fields(cls, data: object) -> object: + if isinstance(data, dict) and "output_model" in data: + raise ValueError("spec.output_model was removed. Use spec.output instead.") + return data + + +class AutomodelJobOutput(BaseModel): + """Stored canonical spec after ``to_spec()``.""" + + model_config = ConfigDict(extra="forbid") + + name: str | None = None + model: str + dataset: DatasetSpec + training: TrainingSpec + schedule: ScheduleSpec + batch: BatchSpec + optimizer: OptimizerSpec + parallelism: ParallelismSpec + output: OutputResponse + integrations: IntegrationsSpec | None = None + + def validate_for_training(self) -> None: + """MoE / parallelism constraints (ported from legacy CustomizationJobOutput).""" + p = self.parallelism + num_nodes = p.num_nodes + num_gpus_per_node = p.num_gpus_per_node + tp = p.tensor_parallel_size + pp = p.pipeline_parallel_size + cp = p.context_parallel_size + ep = p.expert_parallel_size + + total_gpus = num_gpus_per_node * num_nodes + model_parallel_size = tp * pp * cp + if total_gpus % model_parallel_size != 0: + raise ValidationError.from_exception_data( + "parallelism", + [ + { + "type": "value_error", + "loc": ("parallelism",), + "msg": ( + f"Total GPUs ({total_gpus}) must be divisible by " + f"tensor_parallel_size ({tp}) * pipeline_parallel_size ({pp}) * " + f"context_parallel_size ({cp}) = {model_parallel_size}" + ), + "input": p.model_dump(), + } + ], + ) + + derived_dp = total_gpus // model_parallel_size + gb = self.batch.global_batch_size + mb = self.batch.micro_batch_size + divisor = mb * derived_dp + if gb % divisor != 0: + raise ValidationError.from_exception_data( + "batch", + [ + { + "type": "value_error", + "loc": ("batch", "global_batch_size"), + "msg": ( + f"global_batch_size ({gb}) must be divisible by " + f"micro_batch_size ({mb}) * data_parallel_size ({derived_dp}) = {divisor}" + ), + "input": gb, + } + ], + ) + + if ep is not None: + dp_cp = 
derived_dp * cp + if dp_cp % ep != 0: + raise ValidationError.from_exception_data( + "parallelism", + [ + { + "type": "value_error", + "loc": ("parallelism", "expert_parallel_size"), + "msg": ( + f"(data_parallel_size * context_parallel_size) ({dp_cp}) " + f"must be divisible by expert_parallel_size ({ep})" + ), + "input": ep, + } + ], + ) + if ep > 1 and tp > 1 and total_gpus > 1: + raise ValidationError.from_exception_data( + "parallelism", + [ + { + "type": "value_error", + "loc": ("parallelism", "tensor_parallel_size"), + "msg": ( + f"Tensor parallelism (tensor_parallel_size={tp}) is not supported for MoE models " + f"when expert_parallel_size > 1 ({ep}); tensor_parallel_size must be 1." + ), + "input": tp, + } + ], + ) diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/__init__.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/__init__.py new file mode 100644 index 00000000..0b9ecd88 --- /dev/null +++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/__init__.py @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Automodel contributor SDK (mounted under ``client.customization`` by nemo-customizer).""" + +from nemo_automodel_plugin.sdk.resources import ( + AsyncAutomodelCustomization, + AsyncAutomodelJobsResource, + AutomodelCustomization, + AutomodelJobsResource, +) + +__all__ = [ + "AsyncAutomodelCustomization", + "AsyncAutomodelJobsResource", + "AutomodelCustomization", + "AutomodelJobsResource", +] diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/http_utils.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/http_utils.py new file mode 100644 index 00000000..adf4b7ff --- /dev/null +++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/http_utils.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Shared HTTP helpers for Automodel customization SDK resources.""" + +from __future__ import annotations + +from typing import Any +from urllib.parse import quote, urljoin + +from nemo_platform import AsyncNeMoPlatform, NeMoPlatform + +from nemo_automodel_plugin.schema import AutomodelJobInput + +PlatformClient = NeMoPlatform | AsyncNeMoPlatform + +_API_PREFIX = "/apis/customization" +_JOBS_COLLECTION = "v2/workspaces/{workspace}/automodel/jobs" + + +def base_url(source: str) -> str: + """Return the normalized base URL for a raw URL string.""" + return source.rstrip("/") + + +def resolve_workspace(platform: PlatformClient, workspace: str | None, strict: bool = False) -> str: + """Return the explicit, platform, or default workspace for customization routes.""" + resolved = workspace or platform.workspace + if resolved is None: + if strict: + raise ValueError("workspace must be provided when the client has no default workspace") + return "default" + return resolved + + +def url(platform: PlatformClient, path: str, workspace: str | None = None) -> str: + """Build a full customization plugin API URL for the provided route path.""" + resolved_path = path.format(workspace=quote(resolve_workspace(platform, workspace), safe="")) + return _join_url(str(platform.base_url), f"{_API_PREFIX}/{resolved_path}") + + +def jobs_collection_url(platform: PlatformClient, workspace: str | None = None) -> str: + """URL for the Automodel jobs collection in a workspace.""" + return url(platform, _JOBS_COLLECTION, workspace) + + +def job_url(platform: PlatformClient, job_name: str, workspace: str | None = None) -> str: + """URL for a single Automodel job.""" + return _join_url(jobs_collection_url(platform, workspace), quote(job_name, safe="")) + + +def platform_default_headers(platform: PlatformClient) -> dict[str, str]: + """Return string-valued default platform headers for direct HTTP calls.""" + return {str(key): value for key, value in 
platform.default_headers.items() if isinstance(value, str)} + + +def create_job_payload(spec: AutomodelJobInput) -> dict[str, dict[str, Any]]: + """Serialize an Automodel job creation request body.""" + return {"spec": spec.model_dump(mode="json")} + + +def _join_url(root: str, relative_path: str) -> str: + """Join a root URL and a relative path using URL parsing rules.""" + return urljoin(f"{base_url(root)}/", relative_path.lstrip("/")) diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/job_resources.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/job_resources.py new file mode 100644 index 00000000..7832f87a --- /dev/null +++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/job_resources.py @@ -0,0 +1,86 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Automodel job resources for status polling via the customization plugin API.""" + +from __future__ import annotations + +from typing import Any +from urllib.parse import quote + +from nemo_platform_plugin.jobs.schemas import PlatformJobStatusResponse +from pydantic import BaseModel + +from nemo_automodel_plugin.sdk import http_utils + + +class AutomodelJobRecord(BaseModel): + """Minimal job record returned by the customization Automodel jobs API.""" + + name: str + workspace: str + status: str | None = None + spec: dict[str, Any] | None = None + + +class AutomodelJobResource: + """Sync handle for one submitted Automodel job.""" + + def __init__( + self, + job: AutomodelJobRecord, + http_client: Any, + base_url: str, + workspace: str, + headers: dict[str, str], + ) -> None: + self.job = job + self._http_client = http_client + self._base_url = base_url + self._workspace = workspace + self._headers = headers + + def get_status(self) -> PlatformJobStatusResponse: + """Fetch current job status.""" + response = self._http_client.get( + _job_status_path(self._base_url, self._workspace, 
self.job.name), + headers=self._headers, + ) + response.raise_for_status() + return PlatformJobStatusResponse.model_validate(response.json()) + + +class AsyncAutomodelJobResource: + """Async handle for one submitted Automodel job.""" + + def __init__( + self, + job: AutomodelJobRecord, + http_client: Any, + base_url: str, + workspace: str, + headers: dict[str, str], + ) -> None: + self.job = job + self._http_client = http_client + self._base_url = base_url + self._workspace = workspace + self._headers = headers + + async def get_status(self) -> PlatformJobStatusResponse: + """Fetch current job status.""" + response = await self._http_client.get( + _job_status_path(self._base_url, self._workspace, self.job.name), + headers=self._headers, + ) + response.raise_for_status() + return PlatformJobStatusResponse.model_validate(response.json()) + + +def _job_status_path(base_url: str, workspace: str, job_name: str) -> str: + encoded_workspace = quote(workspace, safe="") + encoded_job = quote(job_name, safe="") + return ( + f"{http_utils.base_url(base_url)}/apis/customization/v2/workspaces/" + f"{encoded_workspace}/automodel/jobs/{encoded_job}" + ) diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/resources.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/resources.py new file mode 100644 index 00000000..63e08a92 --- /dev/null +++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/sdk/resources.py @@ -0,0 +1,164 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Automodel contributor SDK resources (composed by ``nemo-customizer-plugin``).""" + +from __future__ import annotations + +from typing import Any + +from nemo_platform import AsyncNeMoPlatform, NeMoPlatform + +from nemo_automodel_plugin.schema import AutomodelJobInput +from nemo_automodel_plugin.sdk import http_utils +from nemo_automodel_plugin.sdk.job_resources import ( + AsyncAutomodelJobResource, + AutomodelJobRecord, + AutomodelJobResource, +) + + +class AutomodelJobsResource: + """Sync SDK namespace at ``client.customization.automodel.jobs``.""" + + def __init__(self, platform: NeMoPlatform) -> None: + self._platform = platform + self._http_client = platform._client + + def plugin_status(self) -> dict[str, object]: + """Return Automodel contributor health from the customization service.""" + response = self._http_client.get( + http_utils.url( + self._platform, + "v2/workspaces/{workspace}/automodel/healthz", + self._platform.workspace, + ), + headers=http_utils.platform_default_headers(self._platform), + ) + response.raise_for_status() + payload = response.json() + if not isinstance(payload, dict): + raise TypeError("Automodel health response must be a JSON object.") + return {str(key): value for key, value in payload.items()} + + def create( + self, + spec: AutomodelJobInput, + workspace: str | None = None, + name: str | None = None, + ) -> AutomodelJobResource: + """Submit an Automodel training job.""" + body: dict[str, Any] = http_utils.create_job_payload(spec) + if name is not None: + body["name"] = name + response = self._http_client.post( + http_utils.jobs_collection_url(self._platform, workspace), + json=body, + headers=http_utils.platform_default_headers(self._platform), + ) + response.raise_for_status() + record = AutomodelJobRecord.model_validate(response.json()) + resolved_ws = http_utils.resolve_workspace(self._platform, workspace) + return AutomodelJobResource( + job=record, + 
http_client=self._http_client, + base_url=http_utils.base_url(str(self._platform.base_url)), + workspace=resolved_ws, + headers=http_utils.platform_default_headers(self._platform), + ) + + def get_job_resource(self, job_name: str, workspace: str | None = None) -> AutomodelJobResource: + """Get a resource handle for an existing Automodel job.""" + resolved_ws = http_utils.resolve_workspace(self._platform, workspace) + response = self._http_client.get( + http_utils.job_url(self._platform, job_name, resolved_ws), + headers=http_utils.platform_default_headers(self._platform), + ) + response.raise_for_status() + return AutomodelJobResource( + job=AutomodelJobRecord.model_validate(response.json()), + http_client=self._http_client, + base_url=http_utils.base_url(str(self._platform.base_url)), + workspace=resolved_ws, + headers=http_utils.platform_default_headers(self._platform), + ) + + +class AsyncAutomodelJobsResource: + """Async SDK namespace at ``client.customization.automodel.jobs``.""" + + def __init__(self, platform: AsyncNeMoPlatform) -> None: + self._platform = platform + self._http_client = platform._client + + async def plugin_status(self) -> dict[str, object]: + """Return Automodel contributor health from the customization service.""" + response = await self._http_client.get( + http_utils.url( + self._platform, + "v2/workspaces/{workspace}/automodel/healthz", + self._platform.workspace, + ), + headers=http_utils.platform_default_headers(self._platform), + ) + response.raise_for_status() + payload = response.json() + if not isinstance(payload, dict): + raise TypeError("Automodel health response must be a JSON object.") + return {str(key): value for key, value in payload.items()} + + async def create( + self, + spec: AutomodelJobInput, + workspace: str | None = None, + name: str | None = None, + ) -> AsyncAutomodelJobResource: + """Submit an Automodel training job.""" + body: dict[str, Any] = http_utils.create_job_payload(spec) + if name is not None: + 
body["name"] = name + response = await self._http_client.post( + http_utils.jobs_collection_url(self._platform, workspace), + json=body, + headers=http_utils.platform_default_headers(self._platform), + ) + response.raise_for_status() + record = AutomodelJobRecord.model_validate(response.json()) + resolved_ws = http_utils.resolve_workspace(self._platform, workspace) + return AsyncAutomodelJobResource( + job=record, + http_client=self._http_client, + base_url=http_utils.base_url(str(self._platform.base_url)), + workspace=resolved_ws, + headers=http_utils.platform_default_headers(self._platform), + ) + + async def get_job_resource(self, job_name: str, workspace: str | None = None) -> AsyncAutomodelJobResource: + """Get a resource handle for an existing Automodel job.""" + resolved_ws = http_utils.resolve_workspace(self._platform, workspace) + response = await self._http_client.get( + http_utils.job_url(self._platform, job_name, resolved_ws), + headers=http_utils.platform_default_headers(self._platform), + ) + response.raise_for_status() + return AsyncAutomodelJobResource( + job=AutomodelJobRecord.model_validate(response.json()), + http_client=self._http_client, + base_url=http_utils.base_url(str(self._platform.base_url)), + workspace=resolved_ws, + headers=http_utils.platform_default_headers(self._platform), + ) + + +class AutomodelCustomization: + """Sync SDK namespace at ``client.customization.automodel``.""" + + def __init__(self, platform: NeMoPlatform) -> None: + self.jobs = AutomodelJobsResource(platform) + + +class AsyncAutomodelCustomization: + """Async SDK namespace at ``client.customization.automodel``.""" + + def __init__(self, platform: AsyncNeMoPlatform) -> None: + self.jobs = AsyncAutomodelJobsResource(platform) diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/transform.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/transform.py new file mode 100644 index 00000000..290fa606 --- /dev/null +++ 
b/plugins/nemo-automodel/src/nemo_automodel_plugin/transform.py @@ -0,0 +1,99 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Input → canonical spec transformation.""" + +from __future__ import annotations + +import uuid +from typing import TYPE_CHECKING + +from nmp.automodel.platform_client import check_dataset_access, fetch_model_entity +from nmp.common.entities.utils import parse_entity_ref + +from nemo_automodel_plugin.schema import ( + AutomodelJobInput, + AutomodelJobOutput, + OutputResponse, +) + +if TYPE_CHECKING: + from nemo_platform import AsyncNeMoPlatform + +_MAX_PREFIX_LEN = 50 +_HEX_LEN = 12 + + +def _random_suffix(prefix: str) -> str: + truncated = prefix[:_MAX_PREFIX_LEN].rstrip("-") + return f"{truncated}-{uuid.uuid4().hex[:_HEX_LEN]}" + + +def _entity_basename(model_ref: str, workspace: str) -> str: + return parse_entity_ref(model_ref, workspace).name + + +def _dataset_basename(uri: str) -> str: + normalized = uri + if normalized.startswith("fileset://"): + normalized = normalized[len("fileset://") :] + return parse_entity_ref(normalized, "default").name + + +def _infer_output_type(input_spec: AutomodelJobInput, is_embedding_model: bool) -> str: + if is_embedding_model: + return "model" + lora = input_spec.training.lora + if input_spec.training.finetuning_type == "lora" and lora is not None and not lora.merge: + return "adapter" + return "model" + + +async def transform_input_to_output( + input_spec: AutomodelJobInput, + workspace: str, + sdk: AsyncNeMoPlatform, +) -> AutomodelJobOutput: + """Enrich submitter input into canonical AutomodelJobOutput.""" + model_entity = await fetch_model_entity(input_spec.model, workspace, sdk) + await check_dataset_access(sdk, input_spec.dataset.training, workspace) + if input_spec.dataset.validation: + await check_dataset_access(sdk, input_spec.dataset.validation, workspace) + + is_embedding = 
bool(model_entity.spec and getattr(model_entity.spec, "is_embedding_model", False)) + if is_embedding: + raise ValueError( + "Embedding-model SFT is not supported in Automodel v1. " + "Use a causal LM checkpoint or wait for a future release." + ) + + entity_name = _entity_basename(input_spec.model, workspace) + dataset_name = _dataset_basename(input_spec.dataset.training) + output_type = _infer_output_type(input_spec, is_embedding) + + if input_spec.output is None: + out_name = _random_suffix(f"{entity_name}-{dataset_name}") + fileset = out_name + else: + out_name = input_spec.output.name + fileset = out_name + + output = OutputResponse( + name=out_name, + type=output_type, # type: ignore[arg-type] + fileset=fileset, + description=input_spec.output.description if input_spec.output else None, + ) + + return AutomodelJobOutput( + name=input_spec.name, + model=input_spec.model, + dataset=input_spec.dataset, + training=input_spec.training, + schedule=input_spec.schedule, + batch=input_spec.batch, + optimizer=input_spec.optimizer, + parallelism=input_spec.parallelism, + output=output, + integrations=input_spec.integrations, + ) diff --git a/plugins/nemo-automodel/tests/fixtures/minimal_sft_lora.json b/plugins/nemo-automodel/tests/fixtures/minimal_sft_lora.json new file mode 100644 index 00000000..b8c05684 --- /dev/null +++ b/plugins/nemo-automodel/tests/fixtures/minimal_sft_lora.json @@ -0,0 +1,30 @@ +{ + "model": "default/qwen3-1.7b", + "dataset": { + "training": "default/train-data" + }, + "training": { + "training_type": "sft", + "finetuning_type": "lora", + "max_seq_length": 2048 + }, + "schedule": { + "epochs": 1, + "max_steps": 10 + }, + "batch": { + "global_batch_size": 8, + "micro_batch_size": 1 + }, + "optimizer": { + "learning_rate": 5e-6 + }, + "parallelism": { + "num_nodes": 1, + "num_gpus_per_node": 1, + "tensor_parallel_size": 1 + }, + "output": { + "name": "test-out" + } +} diff --git a/plugins/nemo-automodel/tests/fixtures/qwen3_0.6b_sft_lora.json 
b/plugins/nemo-automodel/tests/fixtures/qwen3_0.6b_sft_lora.json new file mode 100644 index 00000000..3958c20a --- /dev/null +++ b/plugins/nemo-automodel/tests/fixtures/qwen3_0.6b_sft_lora.json @@ -0,0 +1,30 @@ +{ + "model": "default/qwen3-0.6b", + "dataset": { + "training": "default/qwen3-0.6b-train" + }, + "training": { + "training_type": "sft", + "finetuning_type": "lora", + "max_seq_length": 2048 + }, + "schedule": { + "epochs": 1, + "max_steps": 50 + }, + "batch": { + "global_batch_size": 4, + "micro_batch_size": 1 + }, + "optimizer": { + "learning_rate": 5e-5 + }, + "parallelism": { + "num_nodes": 1, + "num_gpus_per_node": 1, + "tensor_parallel_size": 1 + }, + "output": { + "name": "qwen3-0.6b-lora-out" + } +} diff --git a/plugins/nemo-automodel/tests/test_api.py b/plugins/nemo-automodel/tests/test_api.py new file mode 100644 index 00000000..b5ca3e3b --- /dev/null +++ b/plugins/nemo-automodel/tests/test_api.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import pytest
from fastapi import FastAPI
from fastapi.testclient import TestClient
from nemo_automodel_plugin.contributor import AutomodelContributor
from nemo_customizer.router import CustomizationRouterService


def _make_automodel_app() -> FastAPI:
    """Return a bare FastAPI app that mounts only the Automodel contributor's routers."""
    application = FastAPI()
    for router_spec in AutomodelContributor().get_routers():
        # A spec without a tag is mounted untagged (``tags=None``).
        route_tags = [router_spec.tag] if router_spec.tag else None
        application.include_router(
            router_spec.router,
            prefix=router_spec.prefix,
            tags=route_tags,
        )
    return application


def test_automodel_healthz_under_workspace() -> None:
    """The contributor health route answers under a concrete workspace path."""
    http = TestClient(_make_automodel_app())
    reply = http.get("/v2/workspaces/test-ws/automodel/healthz")
    assert reply.status_code == 200
    assert reply.json() == {"backend": "automodel", "status": "ok"}


def test_automodel_jobs_collection_path() -> None:
    """The jobs collection is registered under the workspace-scoped template."""
    registered = {
        entry.path
        for entry in _make_automodel_app().routes
        if hasattr(entry, "path")
    }
    assert "/v2/workspaces/{workspace}/automodel/jobs" in registered


def test_customization_router_merges_automodel(monkeypatch: pytest.MonkeyPatch) -> None:
    """The router service exposes its own health route plus the contributor's routes."""
    # Patch the name as imported by the router module so discovery returns a
    # single in-process contributor.
    monkeypatch.setattr(
        "nemo_customizer.router.discover_customization_contributors",
        lambda: {"automodel": AutomodelContributor()},
    )
    router_service = CustomizationRouterService()
    merged_app = FastAPI()
    for router_spec in router_service.get_routers():
        merged_app.include_router(router_spec.router, prefix=router_spec.prefix or "")

    http = TestClient(merged_app)
    assert http.get("/healthz").json()["contributors"] == ["automodel"]
    assert http.get("/v2/workspaces/ws-a/automodel/healthz").status_code == 200


def test_workspace_isolation_list_uses_path_segment() -> None:
    """Both Automodel routes carry the ``{workspace}`` path segment in their template."""
    automodel_app = _make_automodel_app()
    registered = {entry.path for entry in automodel_app.routes if hasattr(entry, "path")}
    assert "/v2/workspaces/{workspace}/automodel/jobs" in registered
    assert "/v2/workspaces/{workspace}/automodel/healthz" in registered
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import json
from pathlib import Path

import httpx
import pytest
from nemo_automodel_plugin.cli.inputs import load_job_json
from nemo_automodel_plugin.contributor import AutomodelContributor
from nemo_automodel_plugin.jobs.jobs import AutomodelJob
from nemo_platform_plugin.scheduler import NemoJobScheduler, submit_path_for
from typer.testing import CliRunner

# Directory holding the JSON job fixtures that sit next to this test module.
FIXTURES = Path(__file__).parent / "fixtures"


def test_submit_path_includes_workspace() -> None:
    """The submit path for ``AutomodelJob`` embeds the workspace name."""
    path = submit_path_for(AutomodelJob, workspace="acme-corp")
    assert path == "/apis/customization/v2/workspaces/acme-corp/automodel/jobs"


def test_load_job_json_validates_fixture() -> None:
    """``load_job_json`` returns JSON text that round-trips the fixture's fields."""
    job_path = FIXTURES / "minimal_sft_lora.json"
    spec = json.loads(load_job_json(job_path))
    assert spec["training"]["training_type"] == "sft"
    assert spec["dataset"]["training"] == "default/train-data"


def test_jobs_submit_posts_to_automodel_collection(monkeypatch: pytest.MonkeyPatch) -> None:
    """``submit_remote`` POSTs the spec to the workspace-scoped Automodel jobs URL."""
    capture: dict = {}

    def handler(request: httpx.Request) -> httpx.Response:
        # Record what the scheduler sent; no real network traffic happens
        # because the client below uses ``httpx.MockTransport``.
        capture["method"] = request.method
        capture["url"] = str(request.url)
        capture["body"] = json.loads(request.content)
        return httpx.Response(200, json={"id": "job-1", "status": "queued"})

    # Replace job discovery with a lookup that returns only AutomodelJob.
    monkeypatch.setattr(
        "nemo_platform_plugin.discovery.discover_jobs",
        lambda: {"customization.automodel.jobs": AutomodelJob},
    )
    scheduler = NemoJobScheduler()
    scheduler.submit_remote(
        AutomodelJob,
        json.loads(load_job_json(FIXTURES / "minimal_sft_lora.json")),
        base_url="https://nmp.test",
        workspace="ws-a",
        http_client=httpx.Client(transport=httpx.MockTransport(handler)),
    )

    assert capture["method"] == "POST"
    assert capture["url"] == "https://nmp.test/apis/customization/v2/workspaces/ws-a/automodel/jobs"
    # The request body carries the job spec under the "spec" key.
    assert capture["body"]["spec"]["training"]["training_type"] == "sft"


def test_cli_submit_accepts_job_json_file(monkeypatch: pytest.MonkeyPatch) -> None:
    """Contributor CLI: ``submit JOB.json -w ws`` forwards workspace to submit_remote."""
    submitted: dict = {}

    # Stand-in for ``NemoJobScheduler.submit_remote``; it records the arguments
    # the assertions below inspect and returns a canned job id.
    # NOTE(review): the parameter list presumably mirrors the real method's
    # signature so keyword calls from the CLI bind — confirm when that changes.
    def fake_submit_remote(
        _scheduler,
        job_cls: type,
        spec_data: dict,
        base_url: str | None,
        workspace: str,
        profile: str | None = None,
        options: dict | None = None,
        metadata: dict | None = None,
        http_client: httpx.Client | None = None,
        headers: dict[str, str] | None = None,
    ) -> dict:
        submitted["workspace"] = workspace
        submitted["spec"] = spec_data
        submitted["base_url"] = base_url
        return {"id": "job-99"}

    monkeypatch.setattr(
        "nemo_platform_plugin.commands.NemoJobScheduler.submit_remote",
        fake_submit_remote,
    )
    monkeypatch.setattr(
        "nemo_platform_plugin.discovery.discover_jobs",
        lambda: {"customization.automodel.jobs": AutomodelJob},
    )

    automodel_cli = AutomodelContributor().get_cli()
    runner = CliRunner()
    result = runner.invoke(
        automodel_cli,
        [
            "submit",
            str(FIXTURES / "minimal_sft_lora.json"),
            "--workspace",
            "acme-corp",
            "--base-url",
            "https://nmp.test",
        ],
    )
    # On failure, show both output streams in the assertion message.
    assert result.exit_code == 0, result.stdout + result.stderr
    assert submitted["workspace"] == "acme-corp"
    assert submitted["base_url"] == "https://nmp.test"
    assert submitted["spec"]["model"] == "default/qwen3-1.7b"


def test_cli_run_is_disabled() -> None:
    """``run`` exits with status 1 and reports that local runs are unsupported."""
    automodel_cli = AutomodelContributor().get_cli()
    runner = CliRunner()
    result = runner.invoke(automodel_cli, ["run", str(FIXTURES / "minimal_sft_lora.json")])
    assert result.exit_code == 1
    # NOTE(review): reading ``result.stderr`` assumes the runner captures stderr
    # separately from stdout — confirm for the pinned click/typer versions.
    assert "does not support local run" in result.stderr


def test_cli_expose_input_and_output_schemas() -> None:
    """``explain`` prints JSON containing both schemas and the jobs endpoint."""
    automodel_cli = AutomodelContributor().get_cli()
    runner = CliRunner()
    result = runner.invoke(automodel_cli, ["explain"])
    assert result.exit_code == 0
    payload = json.loads(result.stdout)
    assert "input_spec_schema" in payload
    assert "spec_schema" in payload
    assert "/automodel/jobs" in payload["endpoint"]
+# SPDX-License-Identifier: Apache-2.0 + +import pytest +from nemo_automodel_plugin.schema import AutomodelJobInput + + +def test_reject_output_model() -> None: + with pytest.raises(ValueError, match="output_model"): + AutomodelJobInput.model_validate( + { + "model": "llama", + "dataset": {"training": "default/train"}, + "training": {"training_type": "sft"}, + "output_model": "old-field", + }, + ) + + +def test_distillation_requires_teacher() -> None: + with pytest.raises(ValueError, match="teacher_model"): + AutomodelJobInput.model_validate( + { + "model": "llama", + "dataset": {"training": "default/train"}, + "training": {"training_type": "distillation"}, + }, + ) diff --git a/plugins/nemo-customizer/README.md b/plugins/nemo-customizer/README.md new file mode 100644 index 00000000..927ceda3 --- /dev/null +++ b/plugins/nemo-customizer/README.md @@ -0,0 +1,7 @@ +# nemo-customizer + +Router service for `/apis/customization`. Training backends (Automodel, RL, Megatron, …) register as **`nemo.customization.contributors`** entry points (discovered via `nemo_platform_plugin.discovery`). + +Registers **`nemo.sdk`** → `customization` for `client.customization.*` (composes contributor SDK modules such as `client.customization.automodel.jobs`). + +See [docs/CUSTOMIZATION.md](docs/CUSTOMIZATION.md) for contributor authoring. diff --git a/plugins/nemo-customizer/docs/CUSTOMIZATION.md b/plugins/nemo-customizer/docs/CUSTOMIZATION.md new file mode 100644 index 00000000..dfb63576 --- /dev/null +++ b/plugins/nemo-customizer/docs/CUSTOMIZATION.md @@ -0,0 +1,23 @@ +# Customization contributor guide + +Register a training backend under **`nemo.customization.contributors`** (not `nemo.services`). + +## Contract + +Implement `CustomizationContributor`: + +- `name` — must match the entry-point key (e.g. 
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "nemo-customizer-plugin"
version = "0.1.0"
description = "Customization router for NeMo Platform training backends."
readme = "README.md"
requires-python = ">=3.11,<3.14"
dependencies = [
    "datasets>=3.3.1",
    # nemo_customizer.router imports fastapi.APIRouter at runtime, so fastapi
    # must be a declared runtime dependency, not only a dev-group one.
    "fastapi>=0.115.0",
    "nemo-platform",
    "nemo-platform-plugin",
    "pydantic>=2.10.6",
    "transformers>=4.48.0",
    "typer>=0.12.5",
]

# Mounts the router service that owns /apis/customization.
[project.entry-points."nemo.services"]
customization = "nemo_customizer.router:CustomizationRouterService"

[project.entry-points."nemo.cli"]
customization = "nemo_customizer.cli:CustomizationCLI"

[project.entry-points."nemo.sdk"]
customization = "nemo_customizer.sdk.resources:customization_sdk_resources"

[project.entry-points."nemo.skills"]
customizer = "nemo_customizer.skills:get_skills_path"

[dependency-groups]
dev = [
    "pytest>=8.3.4",
    "pytest-asyncio>=0.25.3",
    "ruff>=0.11.8",
    "fastapi>=0.115.0",
]

[tool.hatch.build.targets.wheel]
packages = ["src/nemo_customizer"]

[tool.uv.sources]
nemo-platform-plugin = { workspace = true }
nemo-platform = { workspace = true }

[tool.pytest.ini_options]
asyncio_mode = "auto"
pythonpath = ["src"]
testpaths = ["tests"]

[tool.pyright]
extraPaths = ["src"]
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""CLI router for customization — mounts contributor subgroups."""

from __future__ import annotations

from typing import ClassVar

import typer
from nemo_platform_plugin.cli import NemoCLI
from nemo_platform_plugin.discovery import discover_customization_contributors


class CustomizationCLI(NemoCLI):
    """Root ``nemo customization`` command group.

    Each discovered customization contributor that provides a Typer app is
    mounted as a sub-command named after its entry-point key.
    """

    name: ClassVar[str] = "customization"
    description: ClassVar[str] = "Customization training backends (Automodel, …)."

    def get_cli(self) -> typer.Typer:
        """Build the Typer app for ``nemo customization``.

        Returns:
            A Typer app with one sub-group per contributor whose ``get_cli()``
            returns a non-``None`` app, added in sorted key order. When no
            contributor is discovered, a hint is written to stderr and the
            app is returned without sub-groups.
        """
        root = typer.Typer(
            name=self.name,
            help=self.description,
            no_args_is_help=True,
        )
        discovered = discover_customization_contributors()

        if discovered:
            # Sorted keys keep the sub-command order stable across runs.
            for backend in sorted(discovered):
                sub_app = discovered[backend].get_cli()
                if sub_app is None:
                    # This contributor ships no CLI; nothing to mount.
                    continue
                root.add_typer(sub_app, name=backend)
        else:
            typer.echo(
                "No customization contributors installed. Add nemo-automodel (or another backend) to enabled-plugins.",
                err=True,
            )

        return root
+# SPDX-License-Identifier: Apache-2.0 + +"""Re-export customization contributor discovery from nemo-platform-plugin.""" + +from nemo_platform_plugin.discovery import ( + CUSTOMIZATION_CONTRIBUTORS_GROUP, + discover_customization_contributor_classes, + discover_customization_contributors, +) + +__all__ = [ + "CUSTOMIZATION_CONTRIBUTORS_GROUP", + "discover_customization_contributor_classes", + "discover_customization_contributors", +] diff --git a/plugins/nemo-customizer/src/nemo_customizer/router.py b/plugins/nemo-customizer/src/nemo_customizer/router.py new file mode 100644 index 00000000..addc683c --- /dev/null +++ b/plugins/nemo-customizer/src/nemo_customizer/router.py @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Customization router service — merges contributor HTTP routes.""" + +from __future__ import annotations + +from typing import ClassVar + +from fastapi import APIRouter +from nemo_platform_plugin.discovery import ( + CUSTOMIZATION_CONTRIBUTORS_GROUP, + discover_customization_contributors, +) +from nemo_platform_plugin.service import NemoService, RouterSpec + + +class CustomizationRouterError(RuntimeError): + """Raised when the customization router cannot start.""" + + +_ROUTER_BASE_DEPENDENCIES = ("entities", "auth", "jobs", "secrets", "files", "models") + + +def merge_router_dependencies(contributors: dict[str, object]) -> list[str]: + """Union platform router deps with each contributor's ``dependencies``.""" + deps = set(_ROUTER_BASE_DEPENDENCIES) + for contributor in contributors.values(): + contrib_deps = getattr(type(contributor), "dependencies", None) or [] + deps.update(contrib_deps) + return sorted(deps) + + +def _assert_no_prefix_collisions(contributors: dict[str, object]) -> None: + prefixes: dict[str, str] = {} + for key, contributor in contributors.items(): + for spec in contributor.get_routers(): # type: 
ignore[union-attr] + prefix = spec.prefix.strip("/") + if prefix in prefixes: + raise CustomizationRouterError( + f"Route prefix collision: contributors {prefixes[prefix]!r} and {key!r} " + f"both use prefix {spec.prefix!r}", + ) + prefixes[prefix] = key + + +class CustomizationRouterService(NemoService): + """Sole ``nemo.services`` owner for ``/apis/customization``.""" + + name: ClassVar[str] = "customization" + dependencies: ClassVar[list[str]] = list(_ROUTER_BASE_DEPENDENCIES) + + def __init__(self) -> None: + self._contributors = discover_customization_contributors() + if not self._contributors: + raise CustomizationRouterError( + "Customization router is enabled but no contributors were discovered. " + "Install a backend plugin (e.g. nemo-automodel) and ensure " + f"'{CUSTOMIZATION_CONTRIBUTORS_GROUP}' entry points are registered.", + ) + _assert_no_prefix_collisions(self._contributors) + type(self).dependencies = merge_router_dependencies(self._contributors) + + def get_routers(self) -> list[RouterSpec]: + router = APIRouter() + + @router.get("/healthz") + async def healthz() -> dict[str, object]: + return { + "plugin": self.name, + "status": "ok", + "contributors": sorted(self._contributors.keys()), + } + + specs: list[RouterSpec] = [ + RouterSpec( + router=router, + tag="Customization", + description="Customization router health.", + prefix="", + ), + ] + + for key in sorted(self._contributors.keys()): + contributor = self._contributors[key] + contributor_specs = contributor.get_routers() + for spec in contributor_specs: + specs.append( + RouterSpec( + router=spec.router, + tag=spec.tag or f"Customization {key}", + description=spec.description, + prefix=spec.prefix, + ), + ) + return specs diff --git a/plugins/nemo-customizer/src/nemo_customizer/sdk/__init__.py b/plugins/nemo-customizer/src/nemo_customizer/sdk/__init__.py new file mode 100644 index 00000000..35c8a2a5 --- /dev/null +++ b/plugins/nemo-customizer/src/nemo_customizer/sdk/__init__.py @@ -0,0 
+1,16 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Customization router SDK (``nemo.sdk`` entry point ``customization``).""" + +from nemo_customizer.sdk.resources import ( + AsyncCustomization, + Customization, + customization_sdk_resources, +) + +__all__ = [ + "AsyncCustomization", + "Customization", + "customization_sdk_resources", +] diff --git a/plugins/nemo-customizer/src/nemo_customizer/sdk/resources.py b/plugins/nemo-customizer/src/nemo_customizer/sdk/resources.py new file mode 100644 index 00000000..b8cc1fb9 --- /dev/null +++ b/plugins/nemo-customizer/src/nemo_customizer/sdk/resources.py @@ -0,0 +1,74 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Customization SDK hub — composes contributor backends under ``client.customization``.""" + +from __future__ import annotations + +import importlib +import logging +from typing import Any + +from nemo_platform import AsyncNeMoPlatform, NeMoPlatform +from nemo_platform_plugin.discovery import discover_customization_contributors +from nemo_platform_plugin.sdk import NemoPluginSDKResources + +logger = logging.getLogger(__name__) + +# Contributor entry-point key → (module, sync class, async class) +_CONTRIBUTOR_SDK: dict[str, tuple[str, str, str]] = { + "automodel": ( + "nemo_automodel_plugin.sdk.resources", + "AutomodelCustomization", + "AsyncAutomodelCustomization", + ), +} + + +def _load_contributor_sdk_class(module_path: str, class_name: str) -> type[Any]: + module = importlib.import_module(module_path) + return getattr(module, class_name) + + +class Customization: + """Sync SDK namespace mounted as ``client.customization``.""" + + def __init__(self, platform: NeMoPlatform) -> None: + contributors = discover_customization_contributors() + for key, (module_path, sync_cls, _async_cls) in 
_CONTRIBUTOR_SDK.items(): + if key not in contributors: + continue + try: + cls = _load_contributor_sdk_class(module_path, sync_cls) + setattr(self, key, cls(platform)) + except ImportError: + logger.warning( + "Customization contributor %r is installed but SDK module %s is missing", + key, + module_path, + ) + + +class AsyncCustomization: + """Async SDK namespace mounted as ``client.customization``.""" + + def __init__(self, platform: AsyncNeMoPlatform) -> None: + contributors = discover_customization_contributors() + for key, (module_path, _sync_cls, async_cls) in _CONTRIBUTOR_SDK.items(): + if key not in contributors: + continue + try: + cls = _load_contributor_sdk_class(module_path, async_cls) + setattr(self, key, cls(platform)) + except ImportError: + logger.warning( + "Customization contributor %r is installed but SDK module %s is missing", + key, + module_path, + ) + + +customization_sdk_resources = NemoPluginSDKResources( + sync_resource=Customization, + async_resource=AsyncCustomization, +) diff --git a/plugins/nemo-customizer/src/nemo_customizer/skills.py b/plugins/nemo-customizer/src/nemo_customizer/skills.py new file mode 100644 index 00000000..982dff6b --- /dev/null +++ b/plugins/nemo-customizer/src/nemo_customizer/skills.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Skills surface for the customization (customizer) plugin.""" + +from __future__ import annotations + +from pathlib import Path + + +def get_skills_path() -> Path: + """Return the directory containing plugin-provided skills.""" + + return Path(__file__).parent / "skills" diff --git a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/SKILL.md b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/SKILL.md new file mode 100644 index 00000000..11e34a06 --- /dev/null +++ b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/SKILL.md @@ -0,0 +1,287 @@ +--- +name: nemo-customizer +description: >- + Fine-tune models on NeMo Platform via `nemo customization automodel submit`: + HF dataset conversion, filesets, model entities, SFT/LoRA job JSON (hyperparameters, + batch, schedule, optimizer), and job polling. Use for train, fine-tune, customize, + SFT, LoRA, learning rate, epochs, or nemo customization. +triggers: + - nemo-customizer + - nemo customizer + - fine-tune + - fine tune + - finetune + - train a model + - customize a model + - sft + - lora + - automodel + - unsloth + - nemo customization + - nemo-customization + - customizer + - customization training + - automodel submit +not-for: + - nemo-build-agent (agent scaffold/deploy, not weight training) + - nemo-explore (agent design only) + - safe-synthesizer (tabular synthetic data training) +compatibility: >- + Requires nemo-customizer-plugin and a customization contributor (`nemo.customization.contributors`). + Platform must expose jobs, files, and models APIs. +maturity: active +license: Apache-2.0 +user-invocable: true +allowed-tools: [Bash, Read, Grep] +--- + +# NeMo Customizer + +End-to-end **SFT + LoRA** on NeMo Platform. Default plugin: **automodel**. Batch shell work; reuse resources with `--exist-ok`; skip CLI `--help` unless a command fails. 
+ +## Gotchas + +- Run all `uv run` commands from the **nemo-platform** git root (top-level `pyproject.toml`), not a plugin subfolder. +- Set `NEMO_BASE_URL` (or `NMP_BASE_URL`) only when the user gives a platform URL; default `http://127.0.0.1:8080`. +- **Never set `max_steps` together with `epochs`.** `max_steps` is a global cap and stops mid-epoch. Test fixtures include `max_steps` for smoke tests — do not copy into production jobs. +- **Job done = top-level `status`** in `completed` | `error` | `cancelled`. Steps can all be `completed` while the job is still `active` (upload, entity registration). `status_details.phase` may stay `training` with `progress_pct: 100` for a long time — keep polling. `poll_automodel_job.sh` exits **1** on `error` or `cancelled`. +- Model spec fills async: **submit without polling** `nemo models get` unless submit fails. +- HF dataset id from the user → convert locally; do not ask for local paths first. +- Dataset fileset name = HF dataset **name** only (`tau/commonsense_qa` → `commonsense_qa`), not the model name. +- Prefer **CHAT** JSONL when the model has a chat template; details in `references/dataset-formats.md`. +- User asks to tune **batch or parallelism** → **Batch sizing** / **Multi-GPU** below. Other fields (LR, epochs, LoRA rank, distillation) → `references/hyperparameters.md` (`nemo customization automodel explain` for schema). +- Skill **defaults** (`micro_batch_size` 1, `global_batch_size` 4) are safe on unknown VRAM. When the user has **≥48 GB** on one GPU, use **Batch sizing** instead of defaults. +- **Do not use local `docker info`** to pick automodel vs unsloth. After auth, run `uv run nemo jobs list-execution-profiles -f json` against the user's platform (see `references/troubleshooting.md`). Default output is a table — **`-f json` is required** for scripting; parse **stdout only** (do not pipe `2>&1` into `json.load`). +- For submit/image/plugin errors, read `references/troubleshooting.md`. 
+ +## Workflow + +``` +- [ ] export NEMO_BASE_URL (if user provided endpoint) +- [ ] cd nemo-platform && uv run nemo auth login --unsigned-token --email <email> +- [ ] uv run nemo jobs list-execution-profiles -f json — GPU profile → automodel; else see troubleshooting (no local docker check) +- [ ] Convert HF dataset → /tmp/train-data/*.jsonl (see references/hf-conversion.md) +- [ ] Create dataset fileset (--exist-ok), upload train.jsonl (+ validation.jsonl), nemo files list to verify +- [ ] Create HF weights fileset + model entity if missing (--exist-ok) +- [ ] Write /tmp/job.json (batch sizing for ≥48 GB GPU; else Defaults table) +- [ ] uv run nemo customization automodel submit /tmp/job.json --workspace default +- [ ] Poll until the top-level status is terminal (scripts/poll_automodel_job.sh or 60–120s manual polls) +- [ ] Report using output template below +``` + +## Fast path + +Substitute `<dataset>`, `<weights-fileset>`, `<model-entity>`, `<hf-repo>`, `<output-name>`, `<job-id>`. + +**Setup** + +```bash +export NEMO_BASE_URL=http://127.0.0.1:8080 # user override only +cd /path/to/nemo-platform +uv run nemo auth login --unsigned-token --email admin@example.com +uv run nemo jobs list-execution-profiles -f json # platform GPU profiles → automodel; set training.execution_profile if needed +``` + +**1. Dataset** — convert per `references/hf-conversion.md`, then: + +```bash +DATASET=<dataset> # e.g. commonsense_qa +uv run nemo files filesets create "$DATASET" --workspace default --purpose dataset --exist-ok +uv run nemo files upload /tmp/train-data/train.jsonl "$DATASET" --workspace default --remote-path train.jsonl +# validation.jsonl if present +uv run nemo files list "$DATASET" --workspace default +``` + +**2. Model** — skip if entity exists (`nemo models list --workspace default`). + +```bash +WEIGHTS=<weights-fileset> # e.g. qwen3-1.7b +MODEL_ENTITY=<model-entity> # Models API entity (not dataset fileset, not HF id) +HF_REPO=<hf-repo> # e.g.
Qwen/Qwen3-1.7B + +uv run nemo files filesets create "$WEIGHTS" --workspace default --purpose model --exist-ok \ + --storage '{"type":"huggingface","repo_id":"'"$HF_REPO"'","repo_type":"model","revision":"main"}' + +uv run nemo models create "$MODEL_ENTITY" --workspace default --exist-ok \ + --input-data '{"name":"'"$MODEL_ENTITY"'","fileset":"default/'"$WEIGHTS"'","custom_fields":{"hf_model_id":"'"$HF_REPO"'"}}' +``` + +**3. Job JSON** — write `/tmp/job.json`. `model` is the **registered model entity** (`default/<model-entity>`), not an HF repo id or dataset fileset. Full hyperparameter reference: `references/hyperparameters.md`. + +```json +{ + "model": "default/<model-entity>", + "dataset": { + "training": "default/<dataset>", + "validation": "default/<dataset>" + }, + "training": { + "training_type": "sft", + "finetuning_type": "lora", + "lora": { "rank": 16, "alpha": 32 }, + "max_seq_length": 2048 + }, + "schedule": { "epochs": 1 }, + "batch": { "global_batch_size": 4, "micro_batch_size": 1 }, + "optimizer": { "learning_rate": 5e-5, "weight_decay": 0.01, "warmup_steps": 0 }, + "parallelism": { "num_nodes": 1, "num_gpus_per_node": 1, "tensor_parallel_size": 1 }, + "output": { "name": "<output-name>" } +} +``` + +**4. Submit and poll** + +```bash +uv run nemo customization automodel submit /tmp/job.json --workspace default +bash plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/scripts/poll_automodel_job.sh automodel-<job-id> 90 +``` + +Or poll manually: `uv run nemo jobs get-status automodel-<job-id>` every 60–120s.
+ +## Defaults + +| Field | Value | +|-------|-------| +| Workspace | `default` | +| Plugin | `automodel` | +| Training | SFT + LoRA, `max_seq_length` 2048 | +| Schedule | `epochs` ≥ 1; omit `max_steps` | +| Parallelism | 1 node, 1 GPU, TP=1 | +| Batch | `global_batch_size` 4, `micro_batch_size` 1 (unknown VRAM; see **Batch sizing** for ≥48 GB) | +| Optimizer | `learning_rate` 5e-5 | +| Auth email | `admin@example.com` unless user specifies | + +## Batch sizing (≥48 GB VRAM) + +Assume **one GPU with at least 48 GB** (e.g. RTX 5880 / A6000 / L40), `parallelism` = 1 node × 1 GPU, `tensor_parallel_size` 1, bf16, `training_type` `sft`, LoRA **rank 16** unless the user asks otherwise. + +**How to size** + +1. Read **model size** from the entity (`nemo models get`) or HF card (parameter count). +2. Pick **`finetuning_type`**: `lora` (adapter only, default) vs `all_weights` (full SFT — much heavier). +3. Set **`max_seq_length`** (2048 is the skill default; shorter seq → more batch headroom). +4. Set **`micro_batch_size`** first (drives peak VRAM), then **`global_batch_size`** as a multiple of `micro_batch_size` (gradient accumulation when GBS > micro). + +**Constraint:** `global_batch_size` must be divisible by `micro_batch_size × data_parallel_size`, where `data_parallel_size = (num_nodes × num_gpus_per_node) / (tensor_parallel_size × pipeline_parallel_size × context_parallel_size)` (1 for a single-GPU job). + +### LoRA (`finetuning_type: lora`) — `max_seq_length` 2048 + +**VRAM does not scale linearly with `micro_batch_size`.** LoRA loads the full base weights once; activation memory grows slowly. On 48 GB, **`micro_batch_size` must decrease as model size grows** (smaller models always ≥ larger models in the table). Use **`global_batch_size` ≈ 4 × `micro_batch_size`**. + +**Default batch** — start here for a reliable full epoch. **High utilization** — optional; double from default (or ramp in steps) to reach **~35–40 GiB**. 
Halve both if OOM (exit **137**) or training crashes (exit **1**). + +| Model params | Default `micro` | Default GBS | `learning_rate` | High-util `micro` | High-util GBS | +|--------------|------------------:|------------:|----------------:|------------------:|--------------:| +| ≤4B | 32 | 128 | `1e-4` | 64 | 256 | +| 4B–8B | 24 | 96 | `8e-5` | 48 | 192 | +| 8B–14B | 16 | 64 | `8e-5` | 24 | 96 | +| >14B | 8 | 32 | `5e-5` | 16 | 64 | + +Validated (`commonsense_qa` @ 2048, 48 GB, one job per GPU): **Qwen3-1.7B** — `micro` 16 / GBS 64 ~8 min; defaults above leave headroom to ramp. **Qwen3-8B** — `micro` 2–4 ≈16–18.5 GiB (under-filled); **`micro` 16 / GBS 64** stable default (~153 steps/epoch); high-util **`micro` 24 / GBS 96** (32 / 128 hit ~40 GiB but failed mid-epoch with exit 1). + +### Multi-GPU (same node) + +Pick the path by whether the **base model fits in ~48 GB on one GPU** (LoRA or full SFT): + +| Situation | `tensor_parallel_size` | Goal | +|-----------|------------------------:|------| +| Model **fits** on one ≥48 GB GPU | **1** | **Data parallel** — more GPUs = faster training; keep `micro` per GPU, scale `global_batch_size` | +| Model **does not fit** on one ≥48 GB GPU | **> 1** (e.g. 
2 on a 2-GPU node) | **Tensor parallel** — shard layers across GPUs so the model fits; lower `micro` / GBS vs single-GPU tables | + +**Data parallel (TP = 1)** — default for Qwen3-8B LoRA and similar on 48 GB cards: + +| Rule | Detail | +|------|--------| +| `micro_batch_size` | **Per GPU** — same as a stable single-GPU run | +| `global_batch_size` | ≈ **single-GPU GBS × `num_gpus_per_node`**; step count ≈ `samples / GBS` | +| Divisibility | `global_batch_size` ÷ **`micro_batch_size × num_gpus_per_node`** must be an integer | +| Scheduling | **One job** owns all GPUs; no overlapping 1-GPU and multi-GPU jobs | + +```json +"parallelism": { "num_nodes": 1, "num_gpus_per_node": 2, "tensor_parallel_size": 1 }, +"batch": { "global_batch_size": 128, "micro_batch_size": 16 } +``` + +**Tensor parallel (TP > 1)** — when weights + activations OOM on a single ≥48 GB GPU (large full SFT, very long `max_seq_length`, or models above the LoRA sizing table without fitting): + +- Set **`num_gpus_per_node`** and **`tensor_parallel_size`** so **`num_gpus_per_node` is divisible by `tensor_parallel_size`** (e.g. 2 GPUs → `tensor_parallel_size: 2`, or 4 GPUs → TP 2 or 4). +- **`data_parallel_size`** = `(num_nodes × num_gpus_per_node) / (tensor_parallel_size × pipeline_parallel_size × context_parallel_size)` — use this in the GBS divisibility rule instead of raw GPU count. +- Start with **lower `micro_batch_size`** than the single-GPU table; increase only if VRAM allows. MoE models: if `expert_parallel_size > 1`, **`tensor_parallel_size` must be 1**. + +```json +"parallelism": { "num_nodes": 1, "num_gpus_per_node": 2, "tensor_parallel_size": 2 }, +"batch": { "global_batch_size": 8, "micro_batch_size": 1 } +``` + +`execution_profile` is usually still **`"gpu"`** — confirm with `uv run nemo jobs list-execution-profiles -f json`. + +**Example — Qwen3-8B LoRA, 2× 48 GB (fits one GPU):** single-GPU **micro 16 / GBS 64** → 2-GPU data parallel **micro 16 / GBS 128**, `learning_rate` `8e-5`. 
+ +### Full-weight SFT (`finetuning_type: all_weights`) — `max_seq_length` 2048 + +| Model params | `micro_batch_size` | `global_batch_size` | `learning_rate` | +|--------------|-------------------:|--------------------:|----------------:| +| ≤2B | 2 | 8 | `2e-5` | +| 2B–4B | 1 | 4 | `1e-5` | +| 4B–8B | 1 | 2 | `5e-6` | +| >8B | 1 | 1 | lower LR or use TP / shorter seq | + +Output type is **model** (full checkpoint), not adapter. Expect much longer runs than LoRA at the same batch. + +### `max_seq_length` scaling + +Scale **`micro_batch_size`** from the 2048 tables (round down, minimum 1): + +| `max_seq_length` | Multiply `micro_batch_size` by | +|------------------|-------------------------------:| +| 512 | 4× | +| 1024 | 2× | +| 2048 | 1× (tables above) | +| 4096 | 0.5× | + +Then set `global_batch_size` to a multiple of the new `micro_batch_size` (often keep the same ratio as the table, e.g. GBS = 4 × micro for LoRA). + +### LoRA rank + +Higher rank uses more VRAM. If OOM at rank 16, drop to rank 8 before lowering batch; if headroom remains, rank 32 is fine for training (deploy rank ≤32 on default NIM/vLLM). + +### Tuning loop + +| Symptom | Action | +|---------|--------| +| CUDA OOM | Halve `micro_batch_size`, then `global_batch_size`, then `max_seq_length` | +| Slow / low GPU memory use | Step up toward the **high-util** column (or double default `micro`+GBS); stop at ~35–40 GiB or when training fails, then use **default** for the retry | +| User wants max throughput | Raise `micro_batch_size` first; keep GBS ≈ 4× micro — avoid `micro_batch_size` 1 with huge GBS | + +Field glossary, distillation/KD, and schema pointers: `references/hyperparameters.md` (batch/multi-GPU → **this file**, not hyperparameters). + +## Worked example + +`Qwen/Qwen3-1.7B` + `tau/commonsense_qa` → CHAT JSONL, fileset `commonsense_qa`, entity `qwen3-1.7b`, output `qwen3-1.7b-commonsense-qa-lora`, `epochs: 1` (no `max_steps`). 
On ≥48 GB GPU use LoRA ≤4B **default**: `micro` 32, GBS 128, `learning_rate` `1e-4` (high-util: 64 / 256). + +## Report to user + +```markdown +## Fine-tune result + +- **Job:** automodel-<job-id> +- **Model entity:** default/<model-entity> +- **Output adapter fileset:** <output-name> +- **Status:** <completed | error | cancelled> +- **Notes:** <notes> +``` + +## Reference files + +| When | Read | +|------|------| +| HF conversion or MCQA shaping | `references/hf-conversion.md` | +| CHAT vs SFT vs CUSTOM | `references/dataset-formats.md` | +| Field glossary, distillation/KD, schema | `references/hyperparameters.md` (not batch sizing) | +| Batch sizing (≥48 GB), OOM / throughput | **Batch sizing** section above | +| Multi-GPU same node | **Multi-GPU (same node)** under batch sizing | +| Backend choice, execution profiles, submit failure, images, CLI | `references/troubleshooting.md` | +| Live JSON schema | `uv run nemo customization automodel explain` | +| Job JSON fixture | `plugins/nemo-automodel/tests/fixtures/qwen3_0.6b_sft_lora.json` (ignore `max_steps` for real runs) | + +Related: `plugins/nemo-automodel/README.md`, `plugins/nemo-customizer/docs/CUSTOMIZATION.md`, skills **`nemo-files`**, **`nemo-status`**. diff --git a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/dataset-formats.md b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/dataset-formats.md new file mode 100644 index 00000000..ad6fd62f --- /dev/null +++ b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/dataset-formats.md @@ -0,0 +1,16 @@ +# Dataset formats (automodel) + +Automodel detects schema from the **first JSONL line** (`DatasetSchema` in `services/automodel/.../datasets/preparation.py`). + +Upload `train.jsonl` and optional `validation.jsonl` at the **fileset root**. Use the same fileset for `dataset.training` and `dataset.validation` in job JSON. 
+ +| Schema | JSONL shape | Job JSON | +|--------|-------------|----------| +| **CHAT** (preferred when model has chat template) | `{"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}` | (none) | +| **SFT** | `{"prompt": "...", "completion": "..."}` | (none) | +| **CUSTOM** | Any two columns, e.g. `{"input": "...", "output": "..."}` | `"prompt_template": "{input} {output}"` on `dataset` | +| **EMBEDDING** | `{"query": "...", "pos_doc": "...", "neg_doc": ["...", "..."]}` | embedding training type when applicable | + +**Conversion preference:** CHAT if `AutoTokenizer.from_pretrained(...).chat_template` is set or the model entity has `spec.is_chat` / `spec.chat_template`; otherwise SFT. Use CUSTOM or EMBEDDING only when the user asks or the task requires it. + +For **CUSTOM**, placeholders in `prompt_template` must match column names exactly (two placeholders). diff --git a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/hf-conversion.md b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/hf-conversion.md new file mode 100644 index 00000000..e7521dc0 --- /dev/null +++ b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/hf-conversion.md @@ -0,0 +1,54 @@ +# Hugging Face dataset conversion + +Run from **nemo-platform** git root: `uv run python …` (plugin brings `datasets` + `transformers`). + +Do **not** ask the user for local paths when they gave an HF dataset id — convert and upload in the same session. + +## Chat-template check + +```python +from transformers import AutoTokenizer +has_chat = bool(getattr(AutoTokenizer.from_pretrained("<hf-repo-id>", trust_remote_code=True), "chat_template", None)) +``` + +If the model entity already exists: `nemo models get <entity> --workspace default` → use `spec.is_chat` or `spec.chat_template` instead of re-downloading the tokenizer. 
+ +## Conversion script (adapt `to_chat` per dataset) + +```python +from datasets import load_dataset +from transformers import AutoTokenizer +import json +from pathlib import Path + +HF_REPO = "" +HF_DATASET = "" # e.g. tau/commonsense_qa +DATASET_NAME = HF_DATASET.split("/")[-1].lower() # fileset name, e.g. commonsense_qa + +has_chat = bool(getattr(AutoTokenizer.from_pretrained(HF_REPO, trust_remote_code=True), "chat_template", None)) + +def to_chat(ex): + # MCQA example (tau/commonsense_qa): + labels, texts = ex["choices"]["label"], ex["choices"]["text"] + choices = "\n".join(f"{l}. {t}" for l, t in zip(labels, texts)) + user = f"Question: {ex['question']}\nChoices:\n{choices}\nAnswer:" + assistant = texts[labels.index(ex["answerKey"])] + return {"messages": [{"role": "user", "content": user}, {"role": "assistant", "content": assistant}]} + +def to_sft(ex): + row = to_chat(ex) + return {"prompt": row["messages"][0]["content"], "completion": row["messages"][1]["content"]} + +convert = to_chat if has_chat else to_sft + +ds = load_dataset(HF_DATASET) +out = Path("/tmp/train-data") +out.mkdir(exist_ok=True) +for split in ("train", "validation"): + if split in ds: + with (out / f"{split}.jsonl").open("w") as f: + for ex in ds[split]: + f.write(json.dumps(convert(ex)) + "\n") +``` + +Then upload (see main skill). Validate with `nemo files list --workspace default`. diff --git a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/hyperparameters.md b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/hyperparameters.md new file mode 100644 index 00000000..3e275ef5 --- /dev/null +++ b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/hyperparameters.md @@ -0,0 +1,313 @@ +# Hyperparameters (automodel job JSON) + +Job JSON for `nemo customization automodel submit` uses **`AutomodelJobInput`** (`plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py`). 
Only fields in that schema are accepted (`extra="forbid"`). + +**Schema dump:** from nemo-platform root: + +```bash +uv run nemo customization automodel explain +``` + +**Contract examples:** `tests/customizer-automodel-contract/input_configs/` (legacy shape; map `batch_size` → `global_batch_size` in submit JSON). + +**Batch sizing, 48 GB VRAM tables, multi-GPU (data parallel vs tensor parallel), and throughput tuning** live in **`SKILL.md`** (§ Batch sizing, § Multi-GPU). This file is the **field glossary**, full JSON template, distillation/KD, and schema pointers — not the place to pick `micro_batch_size` / `global_batch_size` for production runs. + +--- + +## Job JSON layout + +| Section | Purpose | +|---------|---------| +| `model` | **Base model entity** ref (`default/`) — weights to fine-tune | +| `dataset` | **Dataset filesets** (`default/`); optional `prompt_template` for CUSTOM schema | +| `training` | Method, LoRA, `max_seq_length`, distillation/KD fields | +| `schedule` | Epochs, optional step cap, validation cadence, seed | +| `batch` | Global/micro batch, sequence packing | +| `optimizer` | LR, weight decay, warmup | +| `parallelism` | Nodes, GPUs, TP/PP/CP/EP | +| `output` | Output adapter/model fileset name | +| `integrations` | Optional W&B / MLflow | + +### `model` field (base model entity) + +`model` must name a **Models API entity** for the checkpoint being trained — not a dataset fileset, not an output adapter from a prior job, and not a raw Hugging Face repo id. + +| Valid | Invalid | +|-------|---------| +| `default/qwen3-1.7b` (entity from `nemo models create`) | `Qwen/Qwen3-1.7B` (HF id) | +| `default/llama-3.2-1b-instruct` | `default/commonsense_qa` (dataset fileset) | +| `other-ws/my-model` (qualified ref) | `qwen3-1.7b-commonsense-qa-lora` (output fileset only, unless registered as entity) | + +Register before submit (same as skill fast path): HF **model** fileset → `nemo models create …` with `"fileset":"default/"`. 
List: `nemo models list --workspace default`. + +Full template: + +```json +{ + "model": "default/", + "dataset": { + "training": "default/", + "validation": "default/", + "prompt_template": null + }, + "training": { + "training_type": "sft", + "finetuning_type": "lora", + "lora": { + "rank": 16, + "alpha": 32, + "merge": false, + "target_modules": null + }, + "max_seq_length": 2048, + "execution_profile": null + }, + "schedule": { + "epochs": 1, + "max_steps": null, + "val_check_interval": null, + "seed": null + }, + "batch": { + "global_batch_size": 4, + "micro_batch_size": 1, + "sequence_packing": false + }, + "optimizer": { + "learning_rate": 5e-5, + "weight_decay": 0.01, + "warmup_steps": 0 + }, + "parallelism": { + "num_nodes": 1, + "num_gpus_per_node": 1, + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "context_parallel_size": 1, + "expert_parallel_size": null + }, + "output": { "name": "", "description": null }, + "integrations": null +} +``` + +--- + +## Field reference + +### `training` + +| Field | Default | Notes | +|-------|---------|-------| +| `training_type` | `sft` | `distillation` requires `teacher_model` (entity ref) | +| `finetuning_type` | `lora` | `all_weights` (full fine-tune), `lora_merged` (merge adapter into base) | +| `lora.rank` | `16` | Higher → more capacity, more VRAM. Typical training range 8–32; **cap at 32** if the adapter will be served with default NIM / vLLM (rank > 32 may not load) | +| `lora.alpha` | `32` | Scaling; common rule of thumb **alpha ≈ 2× rank** | +| `lora.merge` | `false` | If true with `lora_merged`, output is full weights not adapter | +| `lora.target_modules` | `null` | e.g. `["q_proj","v_proj"]`; null = platform default targets | +| `max_seq_length` | `2048` | Truncate/pack to this length; lower if OOM | +| `teacher_model` | — | **Model entity ref** (not HF id). 
Required for distillation; see below | +| `distillation_ratio` | `0.5` | KD blend (0–1) | +| `distillation_temperature` | `1.0` | KD temperature | +| `teacher_precision` | `bf16` | `bf16` \| `fp16` \| `fp32` | +| `offload_teacher` | `false` | Offload teacher weights to CPU | + +LoRA block is auto-created when `finetuning_type` is `lora` or `lora_merged`. + +### `schedule` + +| Field | Default | Notes | +|-------|---------|-------| +| `epochs` | `1` | Must be **≥ 1**. Full passes over training set | +| `max_steps` | `null` | **Global step cap.** Omit for epoch-based runs | +| `val_check_interval` | `null` | `≤ 1.0` = fraction of epoch; `> 1` = every N steps | +| `seed` | `null` | Reproducibility | + +**Gotcha:** Do **not** set `max_steps` with `epochs` for normal training. `max_steps` stops early (e.g. `epochs: 1` + `max_steps: 100` ends at step 100). Use `max_steps` **alone** only for smoke tests. + +### `batch` + +| Field | Default | Notes | +|-------|---------|-------| +| `global_batch_size` | `8` (schema) | Effective batch across all GPUs; **≥48 GB LoRA tables → `SKILL.md`** | +| `micro_batch_size` | `1` (schema) | **Per GPU**; same SKILL tables for single- and multi-GPU (TP=1) | +| `sequence_packing` | `false` | Pack short sequences for throughput (needs compatible data) | + +**Validation:** `global_batch_size` must be divisible by `micro_batch_size × data_parallel_size`, where: + +`data_parallel_size = (num_nodes × num_gpus_per_node) / (tensor_parallel_size × pipeline_parallel_size × context_parallel_size)` + +Example: 1 node, 2 GPUs, TP=1 → DP=2 → GBS must be a multiple of `2 × micro_batch_size`. See **`SKILL.md` § Multi-GPU** for data parallel vs tensor parallel. 
+ +### `optimizer` + +| Field | Default | Notes | +|-------|---------|-------| +| `learning_rate` | `5e-6` (schema) | Skill uses **5e-5** for small LoRA SFT; see tuning below | +| `weight_decay` | `0.01` | L2-style regularization | +| `warmup_steps` | `0` | Linear warmup; try ~10% of total steps for long runs | + +`adam_beta1` / `adam_beta2` are **not** in the simplified submit schema (fixed in compiler adapter). Use contract JSONs only if your platform version adds them. + +### `parallelism` + +| Field | Default | Notes | +|-------|---------|-------| +| `num_nodes` | `1` | Multi-node distributed jobs | +| `num_gpus_per_node` | `1` | GPUs per node | +| `tensor_parallel_size` | `1` | **> 1** when the model does not fit on one ≥48 GB GPU — see **`SKILL.md` § Multi-GPU** | +| `pipeline_parallel_size` | `1` | Pipeline stages | +| `context_parallel_size` | `1` | Long-context sharding | +| `expert_parallel_size` | `null` | MoE only; must divide `data_parallel_size × context_parallel_size` | + +**MoE:** If `expert_parallel_size > 1` and multiple GPUs, `tensor_parallel_size` must be **1**. + +### `integrations` (optional) + +```json +"integrations": { + "wandb": { "enabled": true, "project": "my-project", "api_key_secret": "wandb-api-key" }, + "mlflow": null +} +``` + +--- + +## Tuning guide (when the user asks) + +Apply user overrides to `/tmp/job.json` before submit. For **batch / GPU count / parallelism**, follow **`SKILL.md`** (defaults table + § Batch sizing + § Multi-GPU). Below covers **non-batch** fields and defers VRAM/batch symptoms to the skill. 
+ +| Symptom / goal | Try first | +|----------------|-----------| +| CUDA OOM | **`SKILL.md` tuning loop:** halve `micro_batch_size`, then `global_batch_size`, then `max_seq_length`; use TP > 1 only if the model does not fit one ≥48 GB GPU | +| Slow / low GPU use | **`SKILL.md`:** step toward high-util column or double `micro`+GBS until ~35–40 GiB; multi-GPU data parallel if model fits one GPU | +| Underfitting | More `epochs`, slightly higher `learning_rate`, higher LoRA `rank` (≤ 32 for NIM/vLLM deploy) | +| Overfitting | Fewer `epochs`, lower `learning_rate`, higher `weight_decay`, smaller `rank` | +| Quick smoke test | `max_steps` only (e.g. 10–50), **omit or ignore epoch goal**; or `epochs: 1` on tiny slice | +| Reproducibility | Set `schedule.seed` | + +### Learning rate (LoRA SFT, starting points) + +| Model scale | Suggested `learning_rate` | +|-------------|---------------------------| +| ≤ 3B | `5e-5` – `1e-4` | +| 3B – 8B | `2e-5` – `5e-5` | +| > 8B | `1e-5` – `2e-5` | + +Schema default is `5e-6` (conservative). Fixtures: `qwen3_0.6b_sft_lora.json` uses `5e-5`; `minimal_sft_lora.json` uses `5e-6`. + +### LoRA rank / alpha + +**Deployment cap:** Default **NIM** and **vLLM** LoRA serving paths support rank **≤ 32**. Keep `rank` at 32 or below when the fine-tuned adapter will be deployed for inference on those stacks, unless the user confirms a higher rank is supported. + +| Use case | `rank` | `alpha` | +|----------|--------|---------| +| Default / balanced | 16 | 32 | +| Low VRAM / light touch | 8 | 16 | +| More capacity (inference-safe max) | 32 | 64 | + +### Epochs vs dataset size + +One epoch = one full pass over `train.jsonl`. Steps per epoch ≈ `train_samples / global_batch_size` (e.g. ~9.7k samples such as `commonsense_qa`, GBS 64 → ~153 steps). Plan poll time from the **GBS you chose in `SKILL.md`**, not the unknown-VRAM default (GBS 4). 
+ +--- + +## Presets (non-batch fields) + +Use **`SKILL.md` § Batch sizing** and **§ Multi-GPU** for `batch` and `parallelism` on ≥48 GB GPUs. Presets below only override schedule / training / optimizer. + +**Smoke test (step-capped)** + +```json +"schedule": { "epochs": 1, "max_steps": 50 } +``` + +**Higher-quality LoRA (more VRAM/time)** + +```json +"training": { "lora": { "rank": 32, "alpha": 64 }, "max_seq_length": 2048 }, +"schedule": { "epochs": 3 }, +"optimizer": { "learning_rate": 2e-5, "warmup_steps": 100 } +``` + +Pair with batch rows from **`SKILL.md`** (e.g. ≤4B default `micro` 32 / GBS 128, not `micro` 1 / GBS 4). + +--- + +## Distillation (`training_type: "distillation"`) + +Use only when the user requests KD/distillation. **`model`** is the **student** entity; **`teacher_model`** is a separate **teacher** entity in the same workspace (unless qualified as `other-ws/name`). + +### Teacher model entity + +`teacher_model` must be a registered **model entity ref**, same shape as `model`: + +| Form | Example | +|------|---------| +| Same workspace | `default/llama-3.2-3b-instruct` | +| Explicit workspace | `default/` | + +It is **not** a Hugging Face repo id. Register the teacher like the student before submit: + +```bash +TEACHER_WEIGHTS=llama-3.2-3b-instruct # fileset name +TEACHER_ENTITY=llama-3.2-3b-instruct # entity name +TEACHER_HF=meta-llama/Llama-3.2-3B-Instruct + +uv run nemo files filesets create "$TEACHER_WEIGHTS" --workspace default --purpose model --exist-ok \ + --storage '{"type":"huggingface","repo_id":"'"$TEACHER_HF"'","repo_type":"model","revision":"main"}' + +uv run nemo models create "$TEACHER_ENTITY" --workspace default --exist-ok \ + --input-data '{"name":"'"$TEACHER_ENTITY"'","fileset":"default/'"$TEACHER_WEIGHTS"'","custom_fields":{"hf_model_id":"'"$TEACHER_HF"'"}}' +``` + +Verify: `nemo models get --workspace default`. Reuse an existing entity with `nemo models list` when present. 
+ +**Compatibility:** Student and teacher must share the **same vocabulary / tokenizer family** (compiler loads both for KD). Mismatched tokenizers fail at runtime. Prefer a larger instruct model as teacher and a smaller base/chat model as student in the same family when possible. + +**VRAM:** Set `offload_teacher: true` if the job OOMs loading student + teacher; `teacher_precision: "bf16"` is the default. + +### Job JSON + +```json +{ + "model": "default/", + "dataset": { "training": "default/" }, + "training": { + "training_type": "distillation", + "finetuning_type": "lora", + "teacher_model": "default/", + "distillation_ratio": 0.5, + "distillation_temperature": 1.0, + "teacher_precision": "bf16", + "offload_teacher": false, + "max_seq_length": 2048 + }, + "schedule": { "epochs": 1 }, + "batch": { "global_batch_size": 64, "micro_batch_size": 16 }, + "optimizer": { "learning_rate": 8e-5 }, + "parallelism": { "num_nodes": 1, "num_gpus_per_node": 1, "tensor_parallel_size": 1 }, + "output": { "name": "" } +} +``` + +(`batch` / `parallelism` example uses an 8B-scale row from **`SKILL.md`**; adjust for student size.) 
+ +| Field | Meaning | +|-------|---------| +| `distillation_ratio` | Blend of KD vs CE loss (`0` = CE only, `1` = KD only) | +| `distillation_temperature` | Softmax temperature for teacher logits | +| `offload_teacher` | CPU-offload frozen teacher weights to save GPU memory | + +--- + +## Source of truth + +| Resource | Path | Use for | +|----------|------|---------| +| **Batch / multi-GPU / 48 GB LoRA** | `SKILL.md` (§ Batch sizing, § Multi-GPU) | Choosing `micro`, GBS, LR, TP vs data parallel | +| Submit schema | `plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py` | Allowed JSON fields | +| Schema → compiler mapping | `services/automodel/src/nmp/automodel/adapter.py` | `dataset.training` → compiler `dataset` string | +| API field descriptions | `services/automodel/src/nmp/automodel/api/v2/jobs/schemas.py` | Compiler-internal shape (not submit JSON) | +| JSON examples | `plugins/nemo-automodel/tests/fixtures/*.json` | Copy-paste templates (ignore fixture `max_steps` in prod) | +| Full spec doc | `plugins/nemo-automodel/SCOPE.md` (simplified JSON section) | Design notes | diff --git a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/troubleshooting.md b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/troubleshooting.md new file mode 100644 index 00000000..15242573 --- /dev/null +++ b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/troubleshooting.md @@ -0,0 +1,69 @@ +# Troubleshooting + +Read this file when submit fails, jobs fail on images, or the user asks for Unsloth. + +## Backend choice (automodel vs unsloth) + +**Do not** run `docker info` on the agent machine. The platform often runs elsewhere (`NEMO_BASE_URL`). Ask the **connected platform** what executors it exposes. 
+ +After `nemo auth login`, list profiles: + +```bash +uv run nemo jobs list-execution-profiles -f json +``` + +REST equivalent (same payload): `GET /apis/jobs/v2/execution-profiles` on the platform base URL with the saved auth token. + +Each entry has `provider`, `profile` (name), and `backend` (e.g. `docker`, `kubernetes_job`, `volcano_job`, `subprocess`). + +| Condition | Plugin | +|-----------|--------| +| User asks for Unsloth | `unsloth` (if installed) | +| Response includes **`provider`: `gpu` or `gpu_distributed`** | **`automodel`** (default) | +| No GPU profiles (only `subprocess` and/or CPU `provider`) | Platform cannot schedule GPU container training → use **`unsloth`** locally if the user has a GPU, or report that remote automodel is unavailable | + +Automodel training steps need a **GPU execution profile** on the platform. `subprocess` profiles run host commands and are not a substitute for automodel’s GPU container step. + +### Pick `training.execution_profile` + +When using automodel, set `training.execution_profile` in job JSON to the **`profile`** string of a GPU row from the list (e.g. `default`, `docker_gpu`). If omitted, the plugin default is usually `gpu` — submit errors mentioning an unknown profile mean you should re-list and set an exact name from the API. + +Quick filter (stdout only — do not use `2>&1` or `json.load` breaks on stderr warnings): + +```bash +uv run nemo jobs list-execution-profiles -f json 2>/dev/null | python3 -c " +import sys, json +for p in json.load(sys.stdin): + if p.get('provider') in ('gpu', 'gpu_distributed'): + print(p['profile'], p.get('backend'), p.get('provider')) +" +``` + +Do not run `nemo customization --help` unless submit returns unknown plugin. + +Automodel uses **`submit` only** (no `run`). Dataset refs in job JSON: `default/`. 
+ +## Missing training images + +Set **before** starting the platform (not per job): + +```bash +export NMP_IMAGE_REGISTRY= +export NMP_IMAGE_TAG= +export NMP_AUTOMODEL_IMAGE_REGISTRY=$NMP_IMAGE_REGISTRY +``` + +Pull automodel images only when the job error mentions a missing image. + +## CLI quick reference + +| Action | Command | +|--------|---------| +| Execution profiles | `nemo jobs list-execution-profiles -f json` | +| Create dataset fileset | `nemo files filesets create --workspace default --purpose dataset --exist-ok` | +| Create HF weights fileset | `nemo files filesets create --workspace default --purpose model --exist-ok --storage '{"type":"huggingface","repo_id":"","repo_type":"model","revision":"main"}'` | +| Upload | `nemo files upload --workspace default --remote-path train.jsonl` | +| List files | `nemo files list --workspace default` | +| Create model | `nemo models create --workspace default --exist-ok --input-data ''` | +| Submit | `nemo customization automodel submit --workspace default` | +| Status | `nemo jobs get-status automodel-` | diff --git a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/scripts/poll_automodel_job.sh b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/scripts/poll_automodel_job.sh new file mode 100755 index 00000000..7eae7331 --- /dev/null +++ b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/scripts/poll_automodel_job.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# Poll automodel job until top-level status is terminal. +# Usage: poll_automodel_job.sh automodel- [interval_seconds] +# Requires: NEMO_BASE_URL or NMP_BASE_URL, run from nemo-platform root with `uv run`. +# Exit 0 on completed; exit 1 on error, cancelled, or get-status failure. 
+ +set -euo pipefail + +JOB="${1:?usage: poll_automodel_job.sh automodel- [interval_seconds]}" +INTERVAL="${2:-90}" + +while true; do + JSON=$(uv run nemo jobs get-status "$JOB" 2>/dev/null) || { + echo "get-status failed for $JOB" >&2 + exit 1 + } + read -r STATUS PHASE <<<"$(printf '%s' "$JSON" | python3 -c " +import sys, json +d = json.load(sys.stdin) +print(d['status'], d.get('status_details', {}).get('phase', '')) +")" + echo "$(date +%H:%M:%S) status=$STATUS phase=$PHASE" + case "$STATUS" in + completed) + printf '%s\n' "$JSON" | python3 -m json.tool + exit 0 + ;; + error|cancelled) + printf '%s\n' "$JSON" | python3 -m json.tool >&2 + exit 1 + ;; + esac + sleep "$INTERVAL" +done diff --git a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/tests.json b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/tests.json new file mode 100644 index 00000000..f5640746 --- /dev/null +++ b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/tests.json @@ -0,0 +1,65 @@ +{ + "skill": "nemo-customizer", + "tests": [ + { + "type": "explicit", + "prompt": "Use the nemo-customizer skill to fine-tune Qwen with automodel.", + "expected_skill": "nemo-customizer" + }, + { + "type": "explicit", + "prompt": "Run nemo-customizer. 
I need to submit an automodel SFT LoRA job on NeMo Platform.", + "expected_skill": "nemo-customizer" + }, + { + "type": "explicit", + "prompt": "Invoke nemo customization via the customizer skill and walk me through filesets and model entity setup.", + "expected_skill": "nemo-customizer" + }, + { + "type": "implicit", + "prompt": "Fine-tune a model with LoRA SFT via nemo customization and a qwen3 Hugging Face weights fileset.", + "expected_skill": "nemo-customizer" + }, + { + "type": "implicit", + "prompt": "Help me run SFT LoRA training with nemo customization automodel.", + "expected_skill": "nemo-customizer" + }, + { + "type": "implicit", + "prompt": "Train a small chat model on a dataset I have locally and register the output on the platform.", + "expected_skill": "nemo-customizer" + }, + { + "type": "contextual", + "prompt": "NeMo Platform is running. Before any customization training, help me explore what my support agent should do.", + "expected_skill_not": "nemo-customizer" + }, + { + "type": "contextual", + "prompt": "I uploaded train.jsonl for fun. Mostly I want nemo-build-agent to deploy my LangGraph NAT workflow from the spec.", + "expected_skill_not": "nemo-customizer" + }, + { + "type": "contextual", + "prompt": "Jobs controller is up. 
Next I need nemo-status and evaluator benchmarks, not model weight training.", + "expected_skill_not": "nemo-customizer" + }, + { + "type": "negative-control", + "prompt": "Use nemo-build-agent to scaffold and deploy my agent from agents/calculator.spec.md.", + "expected_skill_not": "nemo-customizer" + }, + { + "type": "negative-control", + "prompt": "Run safe-synthesizer on my CSV for tabular synthetic data generation.", + "expected_skill_not": "nemo-customizer" + }, + { + "type": "negative-control", + "prompt": "Attach guardrails middleware to my virtual model in the inference gateway.", + "expected_skill_not": "nemo-customizer" + } + ] +} diff --git a/plugins/nemo-customizer/tests/test_customization_discovery_reexport.py b/plugins/nemo-customizer/tests/test_customization_discovery_reexport.py new file mode 100644 index 00000000..120a31fe --- /dev/null +++ b/plugins/nemo-customizer/tests/test_customization_discovery_reexport.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from nemo_customizer.discovery import ( + CUSTOMIZATION_CONTRIBUTORS_GROUP, + discover_customization_contributor_classes, + discover_customization_contributors, +) +from nemo_platform_plugin.discovery import ( + discover_customization_contributors as platform_discover, +) + + +def test_reexport_matches_platform_discovery() -> None: + assert discover_customization_contributors is platform_discover + assert CUSTOMIZATION_CONTRIBUTORS_GROUP == "nemo.customization.contributors" + discover_customization_contributors.cache_clear() + assert isinstance(discover_customization_contributors(), dict) + assert isinstance(discover_customization_contributor_classes(), dict) diff --git a/plugins/nemo-customizer/tests/test_router.py b/plugins/nemo-customizer/tests/test_router.py new file mode 100644 index 00000000..6d9a9546 --- /dev/null +++ b/plugins/nemo-customizer/tests/test_router.py @@ -0,0 +1,103 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from typing import ClassVar + +import pytest +import typer +from fastapi import APIRouter, FastAPI +from fastapi.testclient import TestClient +from nemo_customizer.router import ( + CustomizationRouterError, + CustomizationRouterService, + merge_router_dependencies, +) +from nemo_platform_plugin.service import RouterSpec + + +class _FakeContributor: + name: ClassVar[str] = "fake" + dependencies: ClassVar[list[str]] = ["studio"] + + def get_routers(self) -> list[RouterSpec]: + router = APIRouter() + + @router.get("/ping") + async def ping() -> dict[str, str]: + return {"backend": "fake"} + + return [ + RouterSpec( + router=router, + prefix="/v2/workspaces/{workspace}/fake", + tag="Fake", + ), + ] + + def get_cli(self) -> typer.Typer: + app = typer.Typer() + + @app.command("info") + def info() -> None: + typer.echo("fake") + + return app + + +def test_merge_router_dependencies_unions_contributor_deps() -> None: + deps = merge_router_dependencies({"fake": _FakeContributor()}) + assert "studio" in deps + assert "jobs" in deps + + +def test_router_sets_merged_dependencies(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + "nemo_customizer.router.discover_customization_contributors", + lambda: {"fake": _FakeContributor()}, + ) + CustomizationRouterService() + assert "studio" in CustomizationRouterService.dependencies + + +def test_router_raises_without_contributors(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + "nemo_customizer.router.discover_customization_contributors", + lambda: {}, + ) + with pytest.raises(CustomizationRouterError, match="no contributors"): + CustomizationRouterService() + + +def test_router_merges_contributor_routes(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr( + "nemo_customizer.router.discover_customization_contributors", + lambda: {"fake": _FakeContributor()}, + ) + service = CustomizationRouterService() + app = 
FastAPI() + for spec in service.get_routers(): + if spec.prefix: + app.include_router(spec.router, prefix=spec.prefix) + else: + app.include_router(spec.router) + + client = TestClient(app) + assert client.get("/healthz").json()["contributors"] == ["fake"] + assert client.get("/v2/workspaces/ws-a/fake/ping").json() == {"backend": "fake"} + + +def test_prefix_collision_raises(monkeypatch: pytest.MonkeyPatch) -> None: + class _DupA(_FakeContributor): + name = "a" + + class _DupB(_FakeContributor): + name = "b" + + monkeypatch.setattr( + "nemo_customizer.router.discover_customization_contributors", + lambda: {"a": _DupA(), "b": _DupB()}, + ) + with pytest.raises(CustomizationRouterError, match="collision"): + CustomizationRouterService() diff --git a/plugins/nemo-customizer/tests/test_sdk.py b/plugins/nemo-customizer/tests/test_sdk.py new file mode 100644 index 00000000..a430935d --- /dev/null +++ b/plugins/nemo-customizer/tests/test_sdk.py @@ -0,0 +1,37 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +from nemo_customizer.sdk.resources import ( + AsyncCustomization, + Customization, + customization_sdk_resources, +) +from nemo_platform_plugin.sdk import NemoPluginSDKResources + + +def test_customization_sdk_resources_entry_point_shape() -> None: + assert isinstance(customization_sdk_resources, NemoPluginSDKResources) + assert customization_sdk_resources.sync_resource is Customization + assert customization_sdk_resources.async_resource is AsyncCustomization + + +def test_customization_composes_automodel_when_contributor_present() -> None: + platform = MagicMock() + platform._client = MagicMock() + platform.workspace = "default" + platform.base_url = "http://localhost:8000" + platform.default_headers = {} + + fake_contributor = object() + with patch( + "nemo_customizer.sdk.resources.discover_customization_contributors", + return_value={"automodel": fake_contributor}, + ): + customization = Customization(platform) + + assert hasattr(customization, "automodel") + assert hasattr(customization.automodel, "jobs") diff --git a/plugins/nemo-customizer/tests/test_skills.py b/plugins/nemo-customizer/tests/test_skills.py new file mode 100644 index 00000000..5f60256d --- /dev/null +++ b/plugins/nemo-customizer/tests/test_skills.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + + +from nemo_customizer.skills import get_skills_path + + +def test_get_skills_path_exists() -> None: + path = get_skills_path() + assert path.is_dir() + + +def test_nemo_customizer_skill_present() -> None: + skill_dir = get_skills_path() / "nemo-customizer" + skill = skill_dir / "SKILL.md" + tests = skill_dir / "tests.json" + assert skill.is_file() + assert tests.is_file() + text = skill.read_text() + assert "name: nemo-customizer" in text + assert "nemo customization automodel submit" in text diff --git a/pyproject.toml b/pyproject.toml index b5020d9a..789522e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -178,6 +178,8 @@ enabled-plugins = [ "nemo-auditor-plugin", "nemo-switchyard", "nemo-agents-plugin", + "nemo-customizer-plugin", + "nemo-automodel-plugin", ] # Legacy runtime needed specifically for task images that still invoke @@ -441,6 +443,9 @@ nemo-auditor-plugin = { workspace = true } nemo-switchyard = { workspace = true } nemo-agents-plugin = { workspace = true } nemo-agents-example-calculator = { workspace = true } +nemo-customizer-plugin = { workspace = true } +nemo-automodel-plugin = { workspace = true } +nmp-automodel = { workspace = true } [tool.uv.workspace] @@ -490,6 +495,9 @@ members = [ "plugins/nemo-switchyard", "plugins/nemo-agents", "plugins/nemo-agents/examples/calculator-agent", + "plugins/nemo-customizer", + "plugins/nemo-automodel", + "services/automodel", ] diff --git a/sdk/python/nemo-platform/src/nemo_platform/skills/nemo-fine-tune/SKILL.md b/sdk/python/nemo-platform/src/nemo_platform/skills/nemo-fine-tune/SKILL.md deleted file mode 100644 index 690118a7..00000000 --- a/sdk/python/nemo-platform/src/nemo_platform/skills/nemo-fine-tune/SKILL.md +++ /dev/null @@ -1,43 +0,0 @@ ---- -name: nemo-fine-tune -description: Fine-tune a model on NeMo Platform. Not yet available; this skill describes the path for when fine-tuning ships. 
Use for any "train a model," "fine-tune," "customize a model," or "finetune" intent so the agent tells the user the feature isn't shipped and does not go off and implement training with some other library. -triggers: - - fine-tune - - fine tune - - finetune - - train a model - - train on my data - - customize a model - - sft a model - - model customization - - model fine-tuning -not-for: - - nemo-build-agent (use for agent scaffolding and deployment, not model training) - - nemo-explore (use for agent design conversation) - - nemo-skill-selection (use to disambiguate user intent) -compatibility: NeMo Platform any version. No prerequisites today since fine-tuning is not yet shipped. When fine-tuning lands, this skill will document Customizer plugin requirements (host-gpu mode, training data format, supported base models). -maturity: beta -license: Apache-2.0 -user-invocable: true -allowed-tools: [Read] ---- - -# Fine-tuning on NeMo Platform - -**Fine-tuning is not yet available on NeMo Platform.** Tell the user this directly. Do not run any `nemo customization` CLI commands or scaffold a fine-tuning job; the underlying functionality is not shipped. - -When fine-tuning lands, it will be delivered through a Customizer plugin that wraps NVIDIA's training stack (AutoModel, Megatron-Bridge, and related). This skill will be filled in at that point. - -## What to tell the user today - -- Fine-tuning is on the NeMo Platform roadmap and is not currently functional. Any CLI surface that looks like it should work (`nemo customization jobs ...`) is not connected to a working training backend. -- Other NeMo Platform capabilities they can use today: harden an agent (`nemo-skill-selection` → guardrails / auditor / anonymizer), evaluate an agent (`nemo-skill-selection` → evaluator), tune an agent's prompts and routing (`nemo-skill-selection` → optimization). 
-- If they need fine-tuning urgently, point them at upstream NVIDIA training tools (NeMo Framework, NeMo-RL, Megatron-LM) and tell them this skill will be wired up once the Customizer plugin lands. - -## Verification - -There is nothing to verify. Do not claim a fine-tuning task succeeded. If the user asks the agent to run fine-tuning anyway, refuse and explain why. - -## When fine-tuning ships - -This skill will gain pre-flight checks, a training-data preparation walkthrough, job submission, progress monitoring, and result download. Track the Customizer plugin in the NeMo Platform roadmap; this skill updates when that ships. diff --git a/sdk/python/nemo-platform/src/nemo_platform/skills/nemo-skill-selection/SKILL.md b/sdk/python/nemo-platform/src/nemo_platform/skills/nemo-skill-selection/SKILL.md index 913e81ba..adc621b9 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/skills/nemo-skill-selection/SKILL.md +++ b/sdk/python/nemo-platform/src/nemo_platform/skills/nemo-skill-selection/SKILL.md @@ -1,6 +1,6 @@ --- name: nemo-skill-selection -description: Top-level skill selector for any task involving NeMo Platform (NVIDIA's agent platform). Picks the right downstream skill (setup, explore, spec, build, try, status, teardown, fine-tune) from natural-language intent. Use over generic brainstorming, planning, or onboarding skills for any NeMo Platform task. +description: Top-level skill selector for any task involving NeMo Platform (NVIDIA's agent platform). Picks the right downstream skill (setup, explore, spec, build, try, status, teardown, customization training) from natural-language intent. Use over generic brainstorming, planning, or onboarding skills for any NeMo Platform task. triggers: - build an agent - create an agent @@ -48,7 +48,7 @@ Match the user's intent to one downstream skill. Pick exactly one. 
| "ask my agent", "try the agent", "test it" | `nemo-try-agent` | Send a query to a deployed agent or fall back to model chat | | "status", "what is running", "platform health", "is the platform up", "what's deployed", "show me what's running" | `nemo-status` | Read-only dashboard: platform, agents, providers, models | | "shut down", "stop NeMo", "tear down", "clean up" | `nemo-teardown` | Stop the cluster (keep data, delete platform data, or full cleanup) | -| "fine-tune", "customize the model", "train on my data" | `nemo-fine-tune` | Fine-tuning is not yet available on NeMo Platform. Pick this so the agent tells the user it's not shipped instead of going off to implement training with some other library. | +| "fine-tune", "customize the model", "train on my data", "SFT", "LoRA" | `nemo-customizer` | Model customization via installed customization contributor plugins (`nemo-customizer-plugin`). Requires plugin skills to be installed (`nemo skills install` / enabled-plugins). | | "optimize my agent", "make it cheaper", "reduce latency", "smaller model", "switchyard", "routing split", "compare against a newer model" | `agents-optimize` (plugin-owned, in `plugins/nemo-agents`) | Cost / latency / quality optimization for a **deployed** agent. Routing splits, skill tuning, prompt tuning, new-model scans. | | "secure my agent", "harden my agent", "check for PII", "leaked secrets", "guardrail coverage" | `agents-secure` (plugin-owned, in `plugins/nemo-agents`) | Safety and security audit for a **deployed** agent. Guardrails, PII, secrets scan. | | "evaluate my agent", "run a benchmark", "eval suite" | `nemo-evaluator` (plugin-owned, in `plugins/nemo-evaluator`) | Evaluation metrics, LLM-judge, benchmark jobs against a deployed agent or model. 
| @@ -104,12 +104,12 @@ NeMo Platform skills I can route to: nemo-try-agent query a deployed agent or chat with a model nemo-status read-only platform health dashboard nemo-teardown guided shutdown - nemo-fine-tune fine-tuning (not yet shipped; reports that honestly) Plugin-owned skills: agents-optimize cost / latency / quality optimization for a deployed agent agents-secure safety and security audit for a deployed agent nemo-evaluator evaluation metrics, LLM-judge, benchmark jobs + nemo-customizer fine-tuning of models guardrails content-safety middleware via virtual models auditor red-team vulnerability scanning (garak) data-designer synthetic dataset generation @@ -142,5 +142,5 @@ Do not proactively suggest Studio as the path for anything a skill already cover - **One skill at a time.** Do not load more than one downstream skill in the same turn. Each downstream skill is a full procedure with its own context budget. - **Install must happen before any skill can do useful work.** Build, try, and status all assume the platform is up. If the user has not run the CLI install (`make bootstrap` + `nemo setup`), the skills cannot work around that; hand them to `setup` for instructions. - **NeMo Platform is the product name.** Capital N, e, M, o, P. Not "nemo" or "Nemo." NAT on first mention is "NVIDIA NeMo Agent Toolkit (NAT)." -- **Fine-tuning is not yet available.** When the user asks to fine-tune, train, or customize a model, pick `nemo-fine-tune` so the agent tells the user it's not shipped instead of trying to wire up training with some other library. Do not run `nemo customization` CLI commands; the backend is not connected. +- **Model customization** goes to the `nemo-customizer` plugin skill when `nemo-customizer-plugin` (and a training backend) are installed. If that skill is not available, tell the user to enable customization plugins and install skills — do not improvise training with an external library. 
- **Framework honesty.** If the user describes an agent in CrewAI, AutoGen, plain LangChain, or Pydantic AI, tell them up front that NeMo Platform's optimization and evaluation surfaces operate on NAT-wrapped LangGraph agents. They will need to wrap their agent before the build path produces value. diff --git a/services/automodel/README.md b/services/automodel/README.md new file mode 100644 index 00000000..da108458 --- /dev/null +++ b/services/automodel/README.md @@ -0,0 +1,3 @@ +# nmp-automodel + +Compiler and task entrypoints for NeMo Automodel training jobs on the platform. **No HTTP server** — consumed by `nemo-automodel-plugin` and Jobs task images (`nvcr.io/0921617854601259/nemo-platform-dev/nmp-automodel-tasks`, `.../nmp-automodel-training`). diff --git a/services/automodel/docker/Dockerfile.mamba-wheel b/services/automodel/docker/Dockerfile.mamba-wheel new file mode 100644 index 00000000..35742835 --- /dev/null +++ b/services/automodel/docker/Dockerfile.mamba-wheel @@ -0,0 +1,245 @@ +# syntax=docker/dockerfile:1 +####### +# Mamba Wheel Builder +# +# Builds Python wheels for: +# - causal-conv1d (CUDA extension required by mamba-ssm) +# - mamba-ssm (selective state space model) +# +# Both only ship source distributions on PyPI and require nvcc to compile. +# The two builds are independent stages so BuildKit runs them in parallel. +# Each image stores its wheel at /wheels/*.whl. 
+# Build via Platform bake group: docker buildx bake -f docker-bake.automodel.hcl nmp-automodel-gpu-wheels +# +# Build args: +# CAUSAL_CONV1D_VERSION - git tag to build (default: v1.5.3) +# MAMBA_22_COMMIT - git SHA or tag to build (default: 6b32be06d026e170b3fdaf3ae6282c5a6ff57b06) +# CUDA_VERSION - CUDA devel image version (default: 12.8.1) +# TORCH_CUDA_ARCH_LIST - semicolon-separated SM targets (default: "7.5;8.0;8.6;9.0") +####### + +ARG CUDA_VERSION=12.8.1 + +# ============================================================================= +# Shared base: CUDA + Python 3.11 + torch (required by extension builds) +# ============================================================================= +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS mamba-wheel-base + +ARG TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0" + +ENV DEBIAN_FRONTEND=noninteractive +ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST} + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + git \ + python3.11 \ + python3.11-dev \ + python3.11-venv \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=ghcr.io/astral-sh/uv:0.9.14 /uv /usr/local/bin/uv +COPY services/automodel/docker/locks/mamba-wheel-build-py311 /opt/mamba-wheel-build-py311 + +# Create the Python 3.11 build venv from a committed lockfile. 
+RUN uv venv --python 3.11 /opt/venv && \ + UV_PROJECT_ENVIRONMENT=/opt/venv uv sync \ + --project /opt/mamba-wheel-build-py311 \ + --locked \ + --no-install-project && \ + echo "=== torch version (py311 wheel build) ===" && \ + /opt/venv/bin/python -c "import torch; print(f'torch={torch.__version__}, CUDA={torch.version.cuda}')" + +ENV VIRTUAL_ENV=/opt/venv +ENV PATH="/opt/venv/bin:$PATH" + +# ============================================================================= +# Python 3.12 base: extends mamba-wheel-base with Python 3.12 for cp312 wheels +# ============================================================================= +FROM mamba-wheel-base AS mamba-wheel-base-py312 + +# Install Python 3.12 via deadsnakes PPA (Ubuntu 22.04 ships 3.10 by default) +RUN apt-get update && apt-get install -y --no-install-recommends \ + software-properties-common \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update && apt-get install -y --no-install-recommends \ + python3.12 \ + python3.12-dev \ + python3.12-venv \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +COPY services/automodel/docker/locks/mamba-wheel-build-py312 /opt/mamba-wheel-build-py312 + +# Create the Python 3.12 build venv from a committed lockfile. 
+RUN uv venv --python 3.12 /opt/venv312 && \ + UV_PROJECT_ENVIRONMENT=/opt/venv312 uv sync \ + --project /opt/mamba-wheel-build-py312 \ + --locked \ + --no-install-project && \ + echo "=== torch version (py312 wheel build) ===" && \ + /opt/venv312/bin/python -c "import torch; print(f'torch={torch.__version__}, CUDA={torch.version.cuda}')" + +ENV VIRTUAL_ENV=/opt/venv312 +ENV PATH="/opt/venv312/bin:$PATH" + +# special builder for 13.1.1 cuda +FROM nvcr.io/nvidia/pytorch:26.02-py3 AS mamba-wheel-base-py312-cu13.1.1 + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=ghcr.io/astral-sh/uv:0.9.14 /uv /usr/local/bin/uv + + +# ============================================================================= +# causal-conv1d wheel — Python 3.11 (for nmp-gpu-tasks and nmp-customizer-tasks) +# ============================================================================= +FROM mamba-wheel-base AS causal-conv1d-wheel-builder + +ARG CAUSAL_CONV1D_VERSION=v1.5.3 + +RUN mkdir -p /wheels && \ + git clone --depth 1 --branch ${CAUSAL_CONV1D_VERSION} \ + https://github.com/Dao-AILab/causal-conv1d.git /src/causal-conv1d && \ + cd /src/causal-conv1d && \ + CAUSAL_CONV1D_FORCE_BUILD=TRUE uv build --wheel --no-build-isolation --out-dir=/wheels . 
&& \ + rm -rf /src/causal-conv1d + +# ============================================================================= +# causal-conv1d wheel — Python 3.12 (for Python 3.12 consumers) +# ============================================================================= +FROM mamba-wheel-base-py312 AS causal-conv1d-wheel-builder-py312 + +ARG CAUSAL_CONV1D_VERSION=v1.5.3 + +RUN mkdir -p /wheels && \ + git clone --depth 1 --branch ${CAUSAL_CONV1D_VERSION} \ + https://github.com/Dao-AILab/causal-conv1d.git /src/causal-conv1d && \ + cd /src/causal-conv1d && \ + CAUSAL_CONV1D_FORCE_BUILD=TRUE uv build --wheel --no-build-isolation --out-dir=/wheels . && \ + rm -rf /src/causal-conv1d + +# ============================================================================= +# causal-conv1d wheel — Python 3.12 (for Python 3.12 consumers) - Using CUDA 13.1.1 +# ============================================================================= +FROM mamba-wheel-base-py312-cu13.1.1 AS causal-conv1d-wheel-builder-py312-cu13.1.1 + +ARG CAUSAL_CONV1D_VERSION=v1.5.3 + +RUN mkdir -p /wheels && \ + git clone --depth 1 --branch ${CAUSAL_CONV1D_VERSION} \ + https://github.com/Dao-AILab/causal-conv1d.git /src/causal-conv1d && \ + cd /src/causal-conv1d && \ + CAUSAL_CONV1D_FORCE_BUILD=TRUE uv build --wheel --no-build-isolation --out-dir=/wheels . && \ + rm -rf /src/causal-conv1d + +# The final causal-conv1d-wheel image contains: +# - causal_conv1d-*-cp311-*.whl (for Python 3.11 consumers: nmp-gpu-tasks, nmp-customizer-tasks) +# - causal_conv1d-*-cp312-*.whl (for Python 3.12 consumers) +# Consumers must pin the Python tag glob (e.g. causal_conv1d-*cp311*.whl) to select the right one; the CUDA 13.1.1 cp312 wheel is kept apart under /wheels/cu13.1.1. 
+FROM scratch AS causal-conv1d-wheel +COPY --from=causal-conv1d-wheel-builder /wheels /wheels +COPY --from=causal-conv1d-wheel-builder-py312 /wheels /wheels +COPY --from=causal-conv1d-wheel-builder-py312-cu13.1.1 /wheels /wheels/cu13.1.1 + +# ============================================================================= +# mamba-ssm 2.2.5 wheel — Python 3.11 (for nmp-gpu-tasks) +# ============================================================================= +FROM mamba-wheel-base AS mamba-ssm-wheel-builder + +# post commit after 2.2.5 +ARG MAMBA_22_COMMIT=6b32be06d026e170b3fdaf3ae6282c5a6ff57b06 + +RUN mkdir -p /wheels && \ + git clone https://github.com/state-spaces/mamba.git /src/mamba && \ + cd /src/mamba && \ + git checkout ${MAMBA_22_COMMIT} && \ + sed -i "/triton/d" setup.py && \ + sed -i "/triton/d" pyproject.toml && \ + uv build --wheel --no-build-isolation --out-dir=/wheels . && \ + rm -rf /src/mamba + +# ============================================================================= +# mamba-ssm 2.2.5 wheel — Python 3.12 (for Python 3.12 consumers, e.g. automodel) +# ============================================================================= +FROM mamba-wheel-base-py312 AS mamba-ssm-25-wheel-builder-py312 + +# post commit after 2.2.5 +ARG MAMBA_22_COMMIT=6b32be06d026e170b3fdaf3ae6282c5a6ff57b06 + +RUN mkdir -p /wheels && \ + git clone https://github.com/state-spaces/mamba.git /src/mamba && \ + cd /src/mamba && \ + git checkout ${MAMBA_22_COMMIT} && \ + sed -i "/triton/d" setup.py && \ + sed -i "/triton/d" pyproject.toml && \ + uv build --wheel --no-build-isolation --out-dir=/wheels . 
&& \ + rm -rf /src/mamba + +# ============================================================================= +# mamba-ssm 2.3.0 wheel — Python 3.11 (for nmp-customizer-tasks) +# ============================================================================= +FROM mamba-wheel-base AS mamba-ssm-23-wheel-builder + +ARG MAMBA_23_COMMIT=v2.3.0 + +RUN mkdir -p /wheels && \ + git clone https://github.com/state-spaces/mamba.git /src/mamba && \ + cd /src/mamba && \ + git checkout ${MAMBA_23_COMMIT} && \ + sed -i "/triton/d" setup.py && \ + sed -i "/triton/d" pyproject.toml && \ + uv build --wheel --no-build-isolation --out-dir=/wheels . && \ + rm -rf /src/mamba + +# ============================================================================= +# mamba-ssm 2.3.0 wheel — Python 3.12 (for Python 3.12 consumers) +# ============================================================================= +FROM mamba-wheel-base-py312 AS mamba-ssm-23-wheel-builder-py312 + +ARG MAMBA_23_COMMIT=v2.3.0 + +RUN mkdir -p /wheels && \ + git clone https://github.com/state-spaces/mamba.git /src/mamba && \ + cd /src/mamba && \ + git checkout ${MAMBA_23_COMMIT} && \ + sed -i "/triton/d" setup.py && \ + sed -i "/triton/d" pyproject.toml && \ + uv build --wheel --no-build-isolation --out-dir=/wheels . && \ + rm -rf /src/mamba + + +# ============================================================================= +# mamba-ssm 2.3.0 wheel — Python 3.12 (for Python 3.12 consumers) - Using CUDA 13.1.1 +# ============================================================================= +FROM mamba-wheel-base-py312-cu13.1.1 AS mamba-ssm-23-wheel-builder-py312-cu13.1.1 + +ARG MAMBA_23_COMMIT=v2.3.0 + +RUN mkdir -p /wheels && \ + git clone https://github.com/state-spaces/mamba.git /src/mamba && \ + cd /src/mamba && \ + git checkout ${MAMBA_23_COMMIT} && \ + sed -i "/triton/d" setup.py && \ + sed -i "/triton/d" pyproject.toml && \ + uv build --wheel --no-build-isolation --out-dir=/wheels . 
&& \ + rm -rf /src/mamba + + +# The final mamba-ssm-wheel image contains four wheels under /wheels, plus a CUDA 13.1.1 build of 2.3.0 (cp312) under /wheels/cu13.1.1: +# - mamba_ssm-2.2.5-cp311-*.whl (from MAMBA_22_COMMIT=6b32be06, for nmp-gpu-tasks / Python 3.11) +# - mamba_ssm-2.2.5-cp312-*.whl (from MAMBA_22_COMMIT=6b32be06, for Python 3.12 consumers, e.g. automodel) +# - mamba_ssm-2.3.0-cp311-*.whl (from v2.3.0, for nmp-customizer-tasks / Python 3.11) +# - mamba_ssm-2.3.0-cp312-*.whl (from v2.3.0, for Python 3.12 consumers) +# Consumers must pin both version AND Python tag glob to select the correct wheel. +FROM scratch AS mamba-ssm-wheel +COPY --from=mamba-ssm-wheel-builder /wheels /wheels +COPY --from=mamba-ssm-25-wheel-builder-py312 /wheels /wheels +COPY --from=mamba-ssm-23-wheel-builder /wheels /wheels +COPY --from=mamba-ssm-23-wheel-builder-py312 /wheels /wheels +COPY --from=mamba-ssm-23-wheel-builder-py312-cu13.1.1 /wheels /wheels/cu13.1.1 diff --git a/services/automodel/docker/Dockerfile.nmp-automodel-base b/services/automodel/docker/Dockerfile.nmp-automodel-base new file mode 100644 index 00000000..512b4889 --- /dev/null +++ b/services/automodel/docker/Dockerfile.nmp-automodel-base @@ -0,0 +1,92 @@ +# syntax=docker/dockerfile:1 +# nmp-automodel base - PyTorch NGC image + Automodel + CUDA extension wheels. +# +# Mirrors nmp/docker/Dockerfile.nmp-customizer customizer-automodel-base-builder. +# Publish target: nmp-automodel-base-builder (tags as nmp-automodel-base). 
+ +ARG CAUSAL_CONV1D_WHEEL_IMAGE=local +ARG MAMBA_SSM_WHEEL_IMAGE=local +ARG AUTOMODEL_COMMIT=0e9909f56ba48ef9761fc6f49323ba9d0a0835b2 + +FROM ${CAUSAL_CONV1D_WHEEL_IMAGE} AS causal-conv1d-wheel-src +FROM ${MAMBA_SSM_WHEEL_IMAGE} AS mamba-ssm-wheel-src + +FROM alpine/git AS automodel-clone +ARG AUTOMODEL_COMMIT +RUN git clone --branch main https://github.com/NVIDIA-NeMo/Automodel.git /opt/Automodel && \ + cd /opt/Automodel && \ + git checkout ${AUTOMODEL_COMMIT} && \ + rm -rf /opt/Automodel/.git + +FROM nvcr.io/nvidia/pytorch:26.02-py3 AS nmp-automodel-base-builder + +WORKDIR /opt + +COPY --from=ghcr.io/astral-sh/uv:0.9.14 /uv /bin/uv + +ENV VIRTUAL_ENV=/opt/venv \ + UV_PROJECT_ENVIRONMENT=/opt/venv \ + UV_LINK_MODE=copy \ + UV_COMPILE_BYTECODE=1 +ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH" + +RUN uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages + +COPY --from=automodel-clone /opt/Automodel /opt/Automodel +COPY services/customizer/src/cherry-picks /opt/cherry-picks +RUN cd /opt/Automodel && patch -p1 < /opt/cherry-picks/e6d2930a.diff + +RUN cd /opt/Automodel && \ + bash docker/common/update_pyproject_pytorch.sh /opt/Automodel + +RUN --mount=type=cache,target=/root/.cache/uv \ + cd /opt/Automodel && \ + UV_HTTP_TIMEOUT=120 uv sync --locked --extra all --all-groups + +# Install AFTER Automodel sync - uv sync drops packages not in its lockfile. 
+RUN --mount=from=causal-conv1d-wheel-src,target=/tmp/causal-conv1d-wheel-src,readonly \ + --mount=from=mamba-ssm-wheel-src,target=/tmp/mamba-ssm-wheel-src,readonly \ + uv pip install --no-cache-dir --no-deps \ + /tmp/causal-conv1d-wheel-src/wheels/cu13.1.1/causal_conv1d-*cp312*.whl \ + /tmp/mamba-ssm-wheel-src/wheels/cu13.1.1/mamba_ssm-2.3.0-cp312*.whl + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --no-build-isolation --no-deps git+https://github.com/fanshiqing/grouped_gemm@v1.1.4 + +RUN --mount=type=cache,target=/root/.cache/uv \ + git clone https://github.com/bitsandbytes-foundation/bitsandbytes.git && \ + cd bitsandbytes && \ + git checkout 0.49.1 && \ + cmake -DCOMPUTE_CAPABILITY="75;80;86;87;89;90;100;103;110;120;121" -DCOMPUTE_BACKEND=cuda -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -S . && \ + make -j"$(nproc)" && \ + uv pip install scikit-build-core --no-deps && \ + uv pip install --no-build-isolation --no-deps --force-reinstall . && \ + uv pip uninstall scikit-build-core + +RUN if [ -f /usr/local/bin/torchrun ]; then \ + sed -i '1c\#!/opt/venv/bin/python' /usr/local/bin/torchrun; \ + fi + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install "hf-transfer>=0.1.8,<0.2" + +RUN --mount=type=cache,target=/root/.cache/uv \ + rm -rf /opt/venv/lib/python*/site-packages/vllm \ + /opt/venv/lib/python*/site-packages/vllm-*.dist-info && \ + uv pip install \ + "black>=26.3.1" \ + "pyasn1>=0.6.3" \ + "onnx>=1.21.0" + +# Published base image: fresh PyTorch base plus /opt/venv and /opt/Automodel from the builder; other builder-stage changes (e.g. the /usr/local/bin/torchrun shebang rewrite) are not copied. +FROM nvcr.io/nvidia/pytorch:26.02-py3 AS nmp-automodel-base +COPY --from=nmp-automodel-base-builder /opt/venv /opt/venv +COPY --from=nmp-automodel-base-builder /opt/Automodel /opt/Automodel +# Builder pins uv 0.9.14 but does not ship it in the venv layer; PyTorch base may ship 0.10.x. 
+COPY --from=ghcr.io/astral-sh/uv:0.9.14 /uv /bin/uv + +ENV VIRTUAL_ENV=/opt/venv \ + UV_PROJECT_ENVIRONMENT=/opt/venv \ + HF_HUB_ENABLE_HF_TRANSFER=1 +ENV PATH="/bin:/opt/venv/bin:/root/.local/bin:$PATH" +WORKDIR /opt diff --git a/services/automodel/docker/Dockerfile.nmp-automodel-tasks b/services/automodel/docker/Dockerfile.nmp-automodel-tasks new file mode 100644 index 00000000..5e42558a --- /dev/null +++ b/services/automodel/docker/Dockerfile.nmp-automodel-tasks @@ -0,0 +1,49 @@ +# syntax=docker/dockerfile:1 +# nmp-automodel tasks - file_io, model_entity, and other platform task steps. +# Built on nmp-automodel-base (GPU-capable; runs on CPU or GPU nodes). + +ARG BASE_TAG_AUTOMODEL=local +ARG BASE_REGISTRY=nvcr.io/0921617854601259/nemo-platform-dev +ARG SMOKE_MARKER=smoke_nmp_automodel_tasks + +FROM ${BASE_REGISTRY}/nmp-automodel-base:${BASE_TAG_AUTOMODEL} AS nmp-automodel-base + +FROM nmp-automodel-base AS runtime + +# Pin uv for platform workspace installs (base may lack /bin/uv or PATH may prefer 0.10.x). +COPY --from=ghcr.io/astral-sh/uv:0.9.14 /uv /bin/uv +ENV PATH="/bin:${PATH}" + +ARG USERNAME=ubuntu +ARG USER_UID=1000 +ARG USER_GID=1000 + +ENV HF_HUB_ENABLE_HF_TRANSFER=1 \ + OTEL_PYTHON_EXCLUDED_URLS="health" + +COPY --from=platform-workspace / /app +WORKDIR /app + +RUN mkdir -p /home/${USERNAME}/.cache && \ + chown -R ${USER_UID}:${USER_GID} /home/${USERNAME} /app/services/automodel + +# /app/pyproject.toml is pyproject.workspace.toml (see Dockerfile.platform-workspace). +# --inexact: keep PyTorch / Automodel packages already in the base venv; add nmp-automodel + deps. 
+ENV UV_LINK_MODE=copy \ + UV_PROJECT_ENVIRONMENT=${VIRTUAL_ENV} + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv sync --package nmp-automodel --package nmp-models --no-dev --inexact + +ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" +ENTRYPOINT ["/opt/venv/bin/python"] +CMD ["-m", "nmp.automodel.tasks", "--help"] + +USER ${USER_UID}:${USER_GID} + +FROM runtime AS smoke-test +ARG SMOKE_MARKER +USER 0 +COPY tests/smoke_gpu/ /smoke_test/ +RUN uv pip install --python ${VIRTUAL_ENV}/bin/python --no-cache --reinstall pytest && \ + ${VIRTUAL_ENV}/bin/pytest /smoke_test/ -m ${SMOKE_MARKER} -v diff --git a/services/automodel/docker/Dockerfile.nmp-automodel-training b/services/automodel/docker/Dockerfile.nmp-automodel-training new file mode 100644 index 00000000..75a4fcdc --- /dev/null +++ b/services/automodel/docker/Dockerfile.nmp-automodel-training @@ -0,0 +1,54 @@ +# syntax=docker/dockerfile:1 +# nmp-automodel training - GPU finetune step (nemo_automodel recipes + nmp-automodel package). +# Same platform glue as tasks; separate image tag for the compiler training step. 
+ +ARG BASE_TAG_AUTOMODEL=local +ARG BASE_REGISTRY=nvcr.io/0921617854601259/nemo-platform-dev +ARG SMOKE_MARKER=smoke_nmp_automodel_training + +FROM ${BASE_REGISTRY}/nmp-automodel-base:${BASE_TAG_AUTOMODEL} AS nmp-automodel-base + +FROM nmp-automodel-base AS runtime + +COPY --from=ghcr.io/astral-sh/uv:0.9.14 /uv /bin/uv +ENV PATH="/bin:${PATH}" + +ARG USERNAME=ubuntu +ARG USER_UID=1000 +ARG USER_GID=1000 + +ENV HF_HUB_ENABLE_HF_TRANSFER=1 \ + OTEL_PYTHON_EXCLUDED_URLS="health" + +COPY --from=platform-workspace / /app +WORKDIR /app + +RUN mkdir -p /home/${USERNAME}/.cache && \ + chown -R ${USER_UID}:${USER_GID} /home/${USERNAME} /app/services/automodel + +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --python ${VIRTUAL_ENV}/bin/python --no-cache \ + --overrides /app/services/automodel/docker/no_override_requirements.txt \ + -e /app/sdk/python/nemo-platform \ + -e /app/packages/nemo_platform_plugin \ + -e /app/packages/nmp_common \ + -e /app/services/automodel + +# Re-pin nemo_automodel from the base clone without re-resolving transformers (already in base venv). 
+RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --python ${VIRTUAL_ENV}/bin/python --no-cache --no-deps \ + -e /opt/Automodel + + +ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" +ENTRYPOINT ["/opt/venv/bin/python"] +CMD ["-m", "nmp.automodel.tasks.training", "--help"] + +USER ${USER_UID}:${USER_GID} + +FROM runtime AS smoke-test +ARG SMOKE_MARKER +USER 0 +COPY tests/smoke_gpu/ /smoke_test/ +RUN uv pip install --python ${VIRTUAL_ENV}/bin/python --no-cache --reinstall pytest && \ + ${VIRTUAL_ENV}/bin/pytest /smoke_test/ -m ${SMOKE_MARKER} -v diff --git a/services/automodel/docker/Dockerfile.platform-workspace b/services/automodel/docker/Dockerfile.platform-workspace new file mode 100644 index 00000000..70aeae3f --- /dev/null +++ b/services/automodel/docker/Dockerfile.platform-workspace @@ -0,0 +1,20 @@ +# syntax=docker/dockerfile:1 +# Minimal Platform workspace slice for nmp-automodel container installs. +# Used as a named build context (platform-workspace). +# Keep in sync with services/automodel/docker/pyproject.workspace.toml members. + +FROM scratch AS platform-workspace +# Do not copy repo-root pyproject.toml/uv.lock — they reference the full monorepo workspace. +COPY services/automodel/docker/pyproject.workspace.toml pyproject.toml +# nemo-platform-sdk hatch build force-includes docs/ and mkdocs.yml from repo root. +# docs/api/openapi.yaml is a symlink to ../../openapi/openapi.yaml — copy both. 
+COPY docs docs +COPY openapi openapi +COPY mkdocs.yml mkdocs.yml +COPY packages/nmp_build_tools packages/nmp_build_tools +COPY packages/models packages/models +COPY packages/nmp_common packages/nmp_common +COPY packages/nemo_platform_plugin packages/nemo_platform_plugin +COPY sdk/python/nemo-platform sdk/python/nemo-platform +COPY services/automodel services/automodel +COPY services/core/models services/core/models diff --git a/services/automodel/docker/README.md b/services/automodel/docker/README.md new file mode 100644 index 00000000..13e0ec2c --- /dev/null +++ b/services/automodel/docker/README.md @@ -0,0 +1,98 @@ +# nmp-automodel container images + +Three images derived from the legacy `nmp` **customizer-automodel** base builder (not the full `customizer-automodel` HTTP service image). Published as flat NVCR repo names under **`nvcr.io/0921617854601259/nemo-platform-dev/nmp-automodel-*`** (no nested `nmp/...` path — NVCR rejects that on push). + +| Image | Dockerfile | Role | +|-------|------------|------| +| `nmp-automodel-base` | `Dockerfile.nmp-automodel-base` | PyTorch 26.02 + Automodel + `mamba-ssm` / `causal-conv1d` wheels | +| `nmp-automodel-tasks` | `Dockerfile.nmp-automodel-tasks` | Platform task glue (`file_io`, `model_entity`, `model_spec`); GPU-capable base | +| `nmp-automodel-training` | `Dockerfile.nmp-automodel-training` | Training step (`nmp.automodel.tasks.training`) | + +Full references (default tag `local`): + +- `nvcr.io/0921617854601259/nemo-platform-dev/nmp-automodel-base:local` +- `nvcr.io/0921617854601259/nemo-platform-dev/nmp-automodel-tasks:local` +- `nvcr.io/0921617854601259/nemo-platform-dev/nmp-automodel-training:local` + +Bake file: **`docker-bake.automodel.hcl`** at the Platform repo root (`context = "."`). Run all commands from the Platform repo root. + +## `docker buildx bake --print` + +`--print` only parses the HCL and prints JSON. A **0.0s FINISHED** result is normal — no image is built. 
Use it to verify targets, tags, and platforms before a real build. + +## Prerequisites + +1. **CUDA extension wheels** (`causal-conv1d-wheel`, `mamba-ssm-wheel`) - built from this directory or pulled from NGC. The wheel Dockerfile and uv locks live under `docker/locks/` (ported from `nmp`). + +2. **Base image tag** - after building the base, set `BASE_TAG_AUTOMODEL` (or push to `BASE_REGISTRY`) before building tasks/training. + +## Build wheels and push to NGC (from Platform root) + +```bash +cd /path/to/Platform + +docker login nvcr.io + +export WHEELS_TAG="$(git rev-parse --short HEAD)" +# Bake variables (WHEELS_REGISTRY, WHEELS_TAG, IMAGE_REGISTRY) are overridden via env, not --set. +# Example: +# export WHEELS_REGISTRY=nvcr.io/0921617854601259/nemo-platform-dev +# export IMAGE_REGISTRY=nvcr.io/0921617854601259/nemo-platform-dev + +docker buildx bake --print -f docker-bake.automodel.hcl nmp-automodel-gpu-wheels + +docker buildx bake \ + -f docker-bake.automodel.hcl \ + nmp-automodel-gpu-wheels \ + --push \ + --set "*.platform=linux/amd64" +``` + +Override platform: `export BUILD_PLATFORM=linux/amd64` or `--set "*.platform=linux/amd64"`. + +## Build automodel images (from Platform root) + +```bash +cd /path/to/Platform + +export WHEELS_TAG="${WHEELS_TAG:-3fd6986ff173b598446ffac06d9be3f84b482495}" +export BAKE_TAG="${WHEELS_TAG}" + +docker buildx bake \ + -f docker-bake.automodel.hcl \ + nmp-automodel-base-builder \ + --push \ + --set "*.platform=linux/amd64" + +docker buildx bake \ + -f docker-bake.automodel.hcl \ + nmp-automodel \ + --push \ + --set "*.platform=linux/amd64" +``` + +To use wheels that are already published without rebuilding them, `export WHEELS_TAG=<tag>` (the tag the wheels were pushed under) and set `BAKE_TAG` to the same value. + +Override registry: `export WHEELS_REGISTRY=...` and `export IMAGE_REGISTRY=...` before bake. 
+ +## Tasks / training runtime (platform glue) + +**Base (`nmp-automodel-base`):** Same as `customizer-automodel-base-builder` — NGC PyTorch 26.02, Automodel `uv sync --locked`, pinned `transformers`/`torch`. + +**Tasks image:** `uv sync --package nmp-automodel --no-dev --inexact` from the minimal workspace. CPU steps only need platform SDK glue; upgrading ancillary packages here does not affect training. + +**Training image:** Do **not** use `uv sync` — it upgrades `transformers` and breaks `PreTrainedModel`. Use **`uv pip install -e`** with **`--overrides no_override_requirements.txt`** (customizer pattern), then `uv pip install --no-deps -e /opt/Automodel` to re-pin `nemo_automodel` from the base clone (not PyPI). + +## Runtime + +Entrypoint is `/opt/venv/bin/python`. Job steps pass `-m nmp.automodel.tasks.<step>` (see `nmp.automodel.app.jobs.compiler`). Local smoke: + +```bash +# No extra args → uses image CMD (python -m nmp.automodel.tasks --help). +docker run --rm $NMP_AUTOMODEL_TASKS_IMAGE + +# Extra args replace CMD; include -m nmp.automodel.tasks or you get plain `python --help`. +docker run --rm $NMP_AUTOMODEL_TASKS_IMAGE -m nmp.automodel.tasks --list +``` + +The job compiler resolves `nmp-automodel-tasks` and `nmp-automodel-training` under `NMP_AUTOMODEL_IMAGE_REGISTRY` (default `nvcr.io/0921617854601259/nemo-platform-dev`). See `nmp.automodel.images`. diff --git a/services/automodel/docker/docker-bake.hcl b/services/automodel/docker/docker-bake.hcl new file mode 100644 index 00000000..47cb2b0c --- /dev/null +++ b/services/automodel/docker/docker-bake.hcl @@ -0,0 +1,4 @@ +# Moved to Platform repo root (same pattern as nmp/docker-bake.hcl): +# docker buildx bake -f docker-bake.automodel.hcl +# +# Context is "." (repo root when run from Platform/). Do not use ../../.. here. 
diff --git a/services/automodel/docker/locks/README.md b/services/automodel/docker/locks/README.md new file mode 100644 index 00000000..1dbcfdc3 --- /dev/null +++ b/services/automodel/docker/locks/README.md @@ -0,0 +1,11 @@ +# Mamba / causal-conv1d wheel build locks + +Copied from `nmp/docker/locks/` for building `causal-conv1d-wheel` and `mamba-ssm-wheel` images from the Platform repo (see `Dockerfile.mamba-wheel` and `docker-bake.automodel.hcl` group `nmp-automodel-gpu-wheels`). + +To refresh locks after dependency changes: + +```bash +cd /path/to/Platform +uv lock --project services/automodel/docker/locks/mamba-wheel-build-py311 --python 3.11 +uv lock --project services/automodel/docker/locks/mamba-wheel-build-py312 --python 3.12 +``` diff --git a/services/automodel/docker/locks/mamba-wheel-build-py311/pyproject.toml b/services/automodel/docker/locks/mamba-wheel-build-py311/pyproject.toml new file mode 100644 index 00000000..275dc68f --- /dev/null +++ b/services/automodel/docker/locks/mamba-wheel-build-py311/pyproject.toml @@ -0,0 +1,27 @@ +[project] +name = "mamba-wheel-build-py311" +version = "0.0.0" +requires-python = ">=3.11,<3.12" +dependencies = [ + "packaging", + "setuptools", + "wheel", + "torch==2.10.0+cu128; sys_platform == 'linux'", +] + +[tool.uv] +required-version = ">=0.9.14,<0.10.0" +prerelease = "if-necessary-or-explicit" +index-strategy = "unsafe-best-match" +environments = [ + "sys_platform == 'linux' and platform_machine == 'x86_64'", + "sys_platform == 'linux' and platform_machine == 'aarch64'", +] + +[tool.uv.sources] +torch = { index = "pytorch-cu128", marker = "sys_platform == 'linux'" } + +[[tool.uv.index]] +name = "pytorch-cu128" +url = "https://download.pytorch.org/whl/cu128" +explicit = true diff --git a/services/automodel/docker/locks/mamba-wheel-build-py311/uv.lock b/services/automodel/docker/locks/mamba-wheel-build-py311/uv.lock new file mode 100644 index 00000000..68684673 --- /dev/null +++ 
b/services/automodel/docker/locks/mamba-wheel-build-py311/uv.lock @@ -0,0 +1,355 @@ +version = 1 +revision = 3 +requires-python = "==3.11.*" +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", + "platform_machine == 'aarch64' and sys_platform == 'linux'", +] +supported-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", + "platform_machine == 'aarch64' and sys_platform == 'linux'", +] + +[[package]] +name = "cuda-bindings" +version = "12.9.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cuda-pathfinder", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/a9/2b/ebcbb60aa6dba830474cd360c42e10282f7a343c0a1f58d24fbd3b7c2d77/cuda_bindings-12.9.4-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a6a429dc6c13148ff1e27c44f40a3dd23203823e637b87fd0854205195988306", size = 11840604, upload-time = "2025-10-21T14:51:34.565Z" }, + { url = "https://files.pythonhosted.org/packages/45/e7/b47792cc2d01c7e1d37c32402182524774dadd2d26339bd224e0e913832e/cuda_bindings-12.9.4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c912a3d9e6b6651853eed8eed96d6800d69c08e94052c292fec3f282c5a817c9", size = 12210593, upload-time = "2025-10-21T14:51:36.574Z" }, +] + +[[package]] +name = "cuda-pathfinder" +version = "1.5.2" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f2/f9/1b9b60a30fc463c14cdea7a77228131a0ccc89572e8df9cb86c9648271ab/cuda_pathfinder-1.5.2-py3-none-any.whl", hash = "sha256:0c5f160a7756c5b072723cbbd6d861e38917ef956c68150b02f0b6e9271c71fa", size = 49988, upload-time = "2026-04-06T23:01:05.17Z" }, +] + +[[package]] +name = "filelock" +version = "3.25.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/94/b8/00651a0f559862f3bb7d6f7477b192afe3f583cc5e26403b44e59a55ab34/filelock-3.25.2.tar.gz", hash = "sha256:b64ece2b38f4ca29dd3e810287aa8c48182bbecd1ae6e9ae126c9b35f1382694", size = 40480, upload-time = "2026-03-11T20:45:38.487Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a4/a5/842ae8f0c08b61d6484b52f99a03510a3a72d23141942d216ebe81fefbce/filelock-3.25.2-py3-none-any.whl", hash = "sha256:ca8afb0da15f229774c9ad1b455ed96e85a81373065fb10446672f64444ddf70", size = 26759, upload-time = "2026-03-11T20:45:37.437Z" }, +] + +[[package]] +name = "fsspec" +version = "2026.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441, upload-time = "2026-02-05T21:50:53.743Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" }, +] + +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, +] + +[[package]] +name = "mamba-wheel-build-py311" +version = "0.0.0" +source = { virtual = "." } +dependencies = [ + { name = "packaging", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "setuptools", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "torch", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "wheel", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] + +[package.metadata] +requires-dist = [ + { name = "packaging" }, + { name = "setuptools" }, + { name = "torch", marker = "sys_platform == 'linux'", specifier = "==2.10.0+cu128", index = "https://download.pytorch.org/whl/cu128" }, + { name = "wheel" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1d/09/adf2df3699d87d1d8184038df46a9c80d78c0148492323f4693df54e17bb/markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50", size = 24287, upload-time = "2025-09-27T18:36:20.768Z" }, + { url = "https://files.pythonhosted.org/packages/30/ac/0273f6fcb5f42e314c6d8cd99effae6a5354604d461b8d392b5ec9530a54/markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf", size = 22940, upload-time = "2025-09-27T18:36:22.249Z" }, + { url = "https://files.pythonhosted.org/packages/19/ae/31c1be199ef767124c042c6c3e904da327a2f7f0cd63a0337e1eca2967a8/markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f", size = 21887, upload-time = "2025-09-27T18:36:23.535Z" }, + { url = "https://files.pythonhosted.org/packages/b2/76/7edcab99d5349a4532a459e1fe64f0b0467a3365056ae550d3bcf3f79e1e/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a", size = 23692, upload-time = "2025-09-27T18:36:24.823Z" }, + { url = "https://files.pythonhosted.org/packages/a4/28/6e74cdd26d7514849143d69f0bf2399f929c37dc2b31e6829fd2045b2765/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115", size = 21471, upload-time = "2025-09-27T18:36:25.95Z" }, + { url = "https://files.pythonhosted.org/packages/62/7e/a145f36a5c2945673e590850a6f8014318d5577ed7e5920a4b3448e0865d/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a", size = 22923, upload-time = "2025-09-27T18:36:27.109Z" }, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, +] + +[[package]] +name = "networkx" +version = "3.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, +] + +[[package]] +name = "nvidia-cublas-cu12" +version = "12.8.4.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/29/99/db44d685f0e257ff0e213ade1964fc459b4a690a73293220e98feb3307cf/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:b86f6dd8935884615a0683b663891d43781b819ac4f2ba2b0c9604676af346d0", size = 590537124, upload-time = "2025-03-07T01:43:53.556Z" }, + { url = "https://files.pythonhosted.org/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = 
"sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142", size = 594346921, upload-time = "2025-03-07T01:44:31.254Z" }, +] + +[[package]] +name = "nvidia-cuda-cupti-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/1f/b3bd73445e5cb342727fd24fe1f7b748f690b460acadc27ea22f904502c8/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4412396548808ddfed3f17a467b104ba7751e6b58678a4b840675c56d21cf7ed", size = 9533318, upload-time = "2025-03-07T01:40:10.421Z" }, + { url = "https://files.pythonhosted.org/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182", size = 10248621, upload-time = "2025-03-07T01:40:21.213Z" }, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.8.93" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994", size = 88040029, upload-time = "2025-03-07T01:42:13.562Z" }, + { url = "https://files.pythonhosted.org/packages/eb/d1/e50d0acaab360482034b84b6e27ee83c6738f7d32182b987f9c7a4e32962/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc1fec1e1637854b4c0a65fb9a8346b51dd9ee69e61ebaccc82058441f15bce8", size = 43106076, upload-time = "2025-03-07T01:41:59.817Z" }, +] + +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/7c/75/f865a3b236e4647605ea34cc450900854ba123834a5f1598e160b9530c3a/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:52bf7bbee900262ffefe5e9d5a2a69a30d97e2bc5bb6cc866688caa976966e3d", size = 965265, upload-time = "2025-03-07T01:39:43.533Z" }, + { url = "https://files.pythonhosted.org/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90", size = 954765, upload-time = "2025-03-07T01:40:01.615Z" }, +] + +[[package]] +name = "nvidia-cudnn-cu12" +version = "9.10.2.21" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/41/e79269ce215c857c935fd86bcfe91a451a584dfc27f1e068f568b9ad1ab7/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8", size = 705026878, upload-time = "2025-06-06T21:52:51.348Z" }, + { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" }, +] + +[[package]] +name = "nvidia-cufft-cu12" +version = "11.3.3.83" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +wheels = [ + { 
url = "https://files.pythonhosted.org/packages/60/bc/7771846d3a0272026c416fbb7e5f4c1f146d6d80704534d0b187dd6f4800/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a", size = 193109211, upload-time = "2025-03-07T01:44:56.873Z" }, + { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" }, +] + +[[package]] +name = "nvidia-cufile-cu12" +version = "1.13.1.3" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc", size = 1197834, upload-time = "2025-03-07T01:45:50.723Z" }, + { url = "https://files.pythonhosted.org/packages/1e/f5/5607710447a6fe9fd9b3283956fceeee8a06cda1d2f56ce31371f595db2a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:4beb6d4cce47c1a0f1013d72e02b0994730359e17801d395bdcbf20cfb3bb00a", size = 1120705, upload-time = "2025-03-07T01:45:41.434Z" }, +] + +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.9.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/45/5e/92aa15eca622a388b80fbf8375d4760738df6285b1e92c43d37390a33a9a/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dfab99248034673b779bc6decafdc3404a8a6f502462201f2f31f11354204acd", size = 63625754, upload-time = "2025-03-07T01:46:10.735Z" }, + { url = 
"https://files.pythonhosted.org/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9", size = 63619976, upload-time = "2025-03-07T01:46:23.323Z" }, +] + +[[package]] +name = "nvidia-cusolver-cu12" +version = "11.7.3.90" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cusparse-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/32/f7cd6ce8a7690544d084ea21c26e910a97e077c9b7f07bf5de623ee19981/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0", size = 267229841, upload-time = "2025-03-07T01:46:54.356Z" }, + { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" }, +] + +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.5.8.93" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/bc/f7/cd777c4109681367721b00a106f491e0d0d15cfa1fd59672ce580ce42a97/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc", size = 288117129, upload-time = "2025-03-07T01:47:40.407Z" }, + { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = "2025-03-07T01:48:13.779Z" }, +] + +[[package]] +name = "nvidia-cusparselt-cu12" +version = "0.7.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/73/b9/598f6ff36faaece4b3c50d26f50e38661499ff34346f00e057760b35cc9d/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5", size = 283835557, upload-time = "2025-02-26T00:16:54.265Z" }, + { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" }, +] + +[[package]] +name = "nvidia-nccl-cu12" +version = "2.27.5" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/1c/857979db0ef194ca5e21478a0612bcdbbe59458d7694361882279947b349/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:31432ad4d1fb1004eb0c56203dc9bc2178a1ba69d1d9e02d64a6938ab5e40e7a", size = 322400625, upload-time = "2025-06-26T04:11:04.496Z" }, + { url = 
"https://files.pythonhosted.org/packages/6e/89/f7a07dc961b60645dbbf42e80f2bc85ade7feb9a491b11a1e973aa00071f/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457", size = 322348229, upload-time = "2025-06-26T04:11:28.385Z" }, +] + +[[package]] +name = "nvidia-nvjitlink-cu12" +version = "12.8.93" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88", size = 39254836, upload-time = "2025-03-07T01:49:55.661Z" }, + { url = "https://files.pythonhosted.org/packages/2a/a2/8cee5da30d13430e87bf99bb33455d2724d0a4a9cb5d7926d80ccb96d008/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:adccd7161ace7261e01bb91e44e88da350895c270d23f744f0820c818b7229e7", size = 38386204, upload-time = "2025-03-07T01:49:43.612Z" }, +] + +[[package]] +name = "nvidia-nvshmem-cu12" +version = "3.4.5" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1d/6a/03aa43cc9bd3ad91553a88b5f6fb25ed6a3752ae86ce2180221962bc2aa5/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b48363fc6964dede448029434c6abed6c5e37f823cb43c3bcde7ecfc0457e15", size = 138936938, upload-time = "2025-09-06T00:32:05.589Z" }, + { url = "https://files.pythonhosted.org/packages/b5/09/6ea3ea725f82e1e76684f0708bbedd871fc96da89945adeba65c3835a64c/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:042f2500f24c021db8a06c5eec2539027d57460e1c1a762055a6554f72c369bd", size = 139103095, upload-time = "2025-09-06T00:32:31.266Z" }, +] + 
+[[package]] +name = "nvidia-nvtx-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/10/c0/1b303feea90d296f6176f32a2a70b5ef230f9bdeb3a72bddb0dc922dc137/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d7ad891da111ebafbf7e015d34879f7112832fc239ff0d7d776b6cb685274615", size = 91161, upload-time = "2025-03-07T01:42:23.922Z" }, + { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" }, +] + +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, +] + +[[package]] +name = "setuptools" +version = "82.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4f/db/cfac1baf10650ab4d1c111714410d2fbb77ac5a616db26775db562c8fab2/setuptools-82.0.1.tar.gz", hash = "sha256:7d872682c5d01cfde07da7bccc7b65469d3dca203318515ada1de5eda35efbf9", size = 1152316, upload-time = "2026-03-09T12:47:17.221Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/9d/76/f789f7a86709c6b087c5a2f52f911838cad707cc613162401badc665acfe/setuptools-82.0.1-py3-none-any.whl", hash = "sha256:a59e362652f08dcd477c78bb6e7bd9d80a7995bc73ce773050228a348ce2e5bb", size = 1006223, upload-time = "2026-03-09T12:47:15.026Z" }, +] + +[[package]] +name = "sympy" +version = "1.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, +] + +[[package]] +name = "torch" +version = "2.10.0+cu128" +source = { registry = "https://download.pytorch.org/whl/cu128" } +dependencies = [ + { name = "cuda-bindings", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "filelock", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "fsspec", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "jinja2", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "networkx", marker = "(platform_machine == 
'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cublas-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cuda-cupti-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cuda-runtime-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cudnn-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cufft-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cufile-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-curand-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cusolver-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cusparse-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cusparselt-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = 
"nvidia-nccl-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-nvshmem-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-nvtx-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "triton", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "typing-extensions", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:85ed7944655ea6fd69377692e9cbfd7bba28d99696ceae79985e7caa99cf0a95" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1d01ffaebf64715c0f507a39463149cb19e596ff702bd4bcf862601f2881dabc" }, +] + +[[package]] +name = "triton" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/2c/96f92f3c60387e14cc45aed49487f3486f89ea27106c1b1376913c62abe4/triton-3.6.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49df5ef37379c0c2b5c0012286f80174fcf0e073e5ade1ca9a86c36814553651", size = 176081190, upload-time = 
"2026-01-20T16:16:00.523Z" }, + { url = "https://files.pythonhosted.org/packages/e0/12/b05ba554d2c623bffa59922b94b0775673de251f468a9609bc9e45de95e9/triton-3.6.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8e323d608e3a9bfcc2d9efcc90ceefb764a82b99dea12a86d643c72539ad5d3", size = 188214640, upload-time = "2026-01-20T16:00:35.869Z" }, +] + +[[package]] +name = "typing-extensions" +version = "4.15.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, +] + +[[package]] +name = "wheel" +version = "0.46.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/89/24/a2eb353a6edac9a0303977c4cb048134959dd2a51b48a269dfc9dde00c8a/wheel-0.46.3.tar.gz", hash = "sha256:e3e79874b07d776c40bd6033f8ddf76a7dad46a7b8aa1b2787a83083519a1803", size = 60605, upload-time = "2026-01-22T12:39:49.136Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/22/b76d483683216dde3d67cba61fb2444be8d5be289bf628c13fc0fd90e5f9/wheel-0.46.3-py3-none-any.whl", hash = "sha256:4b399d56c9d9338230118d705d9737a2a468ccca63d5e813e2a4fc7815d8bc4d", size = 30557, upload-time = "2026-01-22T12:39:48.099Z" }, +] diff --git 
a/services/automodel/docker/locks/mamba-wheel-build-py312/pyproject.toml b/services/automodel/docker/locks/mamba-wheel-build-py312/pyproject.toml new file mode 100644 index 00000000..49aecc29 --- /dev/null +++ b/services/automodel/docker/locks/mamba-wheel-build-py312/pyproject.toml @@ -0,0 +1,27 @@ +[project] +name = "mamba-wheel-build-py312" +version = "0.0.0" +requires-python = ">=3.12,<3.13" +dependencies = [ + "packaging", + "setuptools", + "wheel", + "torch==2.10.0+cu128; sys_platform == 'linux'", +] + +[tool.uv] +required-version = ">=0.9.14,<0.10.0" +prerelease = "if-necessary-or-explicit" +index-strategy = "unsafe-best-match" +environments = [ + "sys_platform == 'linux' and platform_machine == 'x86_64'", + "sys_platform == 'linux' and platform_machine == 'aarch64'", +] + +[tool.uv.sources] +torch = { index = "pytorch-cu128", marker = "sys_platform == 'linux'" } + +[[tool.uv.index]] +name = "pytorch-cu128" +url = "https://download.pytorch.org/whl/cu128" +explicit = true diff --git a/services/automodel/docker/locks/mamba-wheel-build-py312/uv.lock b/services/automodel/docker/locks/mamba-wheel-build-py312/uv.lock new file mode 100644 index 00000000..6d8bec6d --- /dev/null +++ b/services/automodel/docker/locks/mamba-wheel-build-py312/uv.lock @@ -0,0 +1,356 @@ +version = 1 +revision = 3 +requires-python = "==3.12.*" +resolution-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", + "platform_machine == 'aarch64' and sys_platform == 'linux'", +] +supported-markers = [ + "platform_machine == 'x86_64' and sys_platform == 'linux'", + "platform_machine == 'aarch64' and sys_platform == 'linux'", +] + +[[package]] +name = "cuda-bindings" +version = "12.9.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cuda-pathfinder", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/0c/c2/65bfd79292b8ff18be4dd7f7442cea37bcbc1a228c1886f1dea515c45b67/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:694ba35023846625ef471257e6b5a4bc8af690f961d197d77d34b1d1db393f56", size = 11760260, upload-time = "2025-10-21T14:51:40.79Z" }, + { url = "https://files.pythonhosted.org/packages/a9/c1/dabe88f52c3e3760d861401bb994df08f672ec893b8f7592dc91626adcf3/cuda_bindings-12.9.4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fda147a344e8eaeca0c6ff113d2851ffca8f7dfc0a6c932374ee5c47caa649c8", size = 12151019, upload-time = "2025-10-21T14:51:43.167Z" }, +] + +[[package]] +name = "cuda-pathfinder" +version = "1.5.2" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f2/f9/1b9b60a30fc463c14cdea7a77228131a0ccc89572e8df9cb86c9648271ab/cuda_pathfinder-1.5.2-py3-none-any.whl", hash = "sha256:0c5f160a7756c5b072723cbbd6d861e38917ef956c68150b02f0b6e9271c71fa", size = 49988, upload-time = "2026-04-06T23:01:05.17Z" }, +] + +[[package]] +name = "filelock" +version = "3.25.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/94/b8/00651a0f559862f3bb7d6f7477b192afe3f583cc5e26403b44e59a55ab34/filelock-3.25.2.tar.gz", hash = "sha256:b64ece2b38f4ca29dd3e810287aa8c48182bbecd1ae6e9ae126c9b35f1382694", size = 40480, upload-time = "2026-03-11T20:45:38.487Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a4/a5/842ae8f0c08b61d6484b52f99a03510a3a72d23141942d216ebe81fefbce/filelock-3.25.2-py3-none-any.whl", hash = "sha256:ca8afb0da15f229774c9ad1b455ed96e85a81373065fb10446672f64444ddf70", size = 26759, upload-time = "2026-03-11T20:45:37.437Z" }, +] + +[[package]] +name = "fsspec" +version = "2026.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441, upload-time = "2026-02-05T21:50:53.743Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e6/ab/fb21f4c939bb440104cc2b396d3be1d9b7a9fd3c6c2a53d98c45b3d7c954/fsspec-2026.2.0-py3-none-any.whl", hash = "sha256:98de475b5cb3bd66bedd5c4679e87b4fdfe1a3bf4d707b151b3c07e58c9a2437", size = 202505, upload-time = "2026-02-05T21:50:51.819Z" }, +] + +[[package]] +name = "jinja2" +version = "3.1.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "markupsafe", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, +] + +[[package]] +name = "mamba-wheel-build-py312" +version = "0.0.0" +source = { virtual = "." 
} +dependencies = [ + { name = "packaging", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "setuptools", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "torch", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "wheel", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] + +[package.metadata] +requires-dist = [ + { name = "packaging" }, + { name = "setuptools" }, + { name = "torch", marker = "sys_platform == 'linux'", specifier = "==2.10.0+cu128", index = "https://download.pytorch.org/whl/cu128" }, + { name = "wheel" }, +] + +[[package]] +name = "markupsafe" +version = "3.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332, upload-time = "2025-09-27T18:36:32.813Z" }, + { url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947, 
upload-time = "2025-09-27T18:36:33.86Z" }, + { url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962, upload-time = "2025-09-27T18:36:35.099Z" }, + { url = "https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760, upload-time = "2025-09-27T18:36:36.001Z" }, + { url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529, upload-time = "2025-09-27T18:36:36.906Z" }, + { url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015, upload-time = "2025-09-27T18:36:37.868Z" }, +] + +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = 
"2023-03-07T16:47:09.197Z" }, +] + +[[package]] +name = "networkx" +version = "3.6.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" }, +] + +[[package]] +name = "nvidia-cublas-cu12" +version = "12.8.4.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/29/99/db44d685f0e257ff0e213ade1964fc459b4a690a73293220e98feb3307cf/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:b86f6dd8935884615a0683b663891d43781b819ac4f2ba2b0c9604676af346d0", size = 590537124, upload-time = "2025-03-07T01:43:53.556Z" }, + { url = "https://files.pythonhosted.org/packages/dc/61/e24b560ab2e2eaeb3c839129175fb330dfcfc29e5203196e5541a4c44682/nvidia_cublas_cu12-12.8.4.1-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:8ac4e771d5a348c551b2a426eda6193c19aa630236b418086020df5ba9667142", size = 594346921, upload-time = "2025-03-07T01:44:31.254Z" }, +] + +[[package]] +name = "nvidia-cuda-cupti-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d5/1f/b3bd73445e5cb342727fd24fe1f7b748f690b460acadc27ea22f904502c8/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4412396548808ddfed3f17a467b104ba7751e6b58678a4b840675c56d21cf7ed", size = 9533318, upload-time = 
"2025-03-07T01:40:10.421Z" }, + { url = "https://files.pythonhosted.org/packages/f8/02/2adcaa145158bf1a8295d83591d22e4103dbfd821bcaf6f3f53151ca4ffa/nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ea0cb07ebda26bb9b29ba82cda34849e73c166c18162d3913575b0c9db9a6182", size = 10248621, upload-time = "2025-03-07T01:40:21.213Z" }, +] + +[[package]] +name = "nvidia-cuda-nvrtc-cu12" +version = "12.8.93" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/05/6b/32f747947df2da6994e999492ab306a903659555dddc0fbdeb9d71f75e52/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:a7756528852ef889772a84c6cd89d41dfa74667e24cca16bb31f8f061e3e9994", size = 88040029, upload-time = "2025-03-07T01:42:13.562Z" }, + { url = "https://files.pythonhosted.org/packages/eb/d1/e50d0acaab360482034b84b6e27ee83c6738f7d32182b987f9c7a4e32962/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fc1fec1e1637854b4c0a65fb9a8346b51dd9ee69e61ebaccc82058441f15bce8", size = 43106076, upload-time = "2025-03-07T01:41:59.817Z" }, +] + +[[package]] +name = "nvidia-cuda-runtime-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7c/75/f865a3b236e4647605ea34cc450900854ba123834a5f1598e160b9530c3a/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:52bf7bbee900262ffefe5e9d5a2a69a30d97e2bc5bb6cc866688caa976966e3d", size = 965265, upload-time = "2025-03-07T01:39:43.533Z" }, + { url = "https://files.pythonhosted.org/packages/0d/9b/a997b638fcd068ad6e4d53b8551a7d30fe8b404d6f1804abf1df69838932/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:adade8dcbd0edf427b7204d480d6066d33902cab2a4707dcfc48a2d0fd44ab90", 
size = 954765, upload-time = "2025-03-07T01:40:01.615Z" }, +] + +[[package]] +name = "nvidia-cudnn-cu12" +version = "9.10.2.21" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/41/e79269ce215c857c935fd86bcfe91a451a584dfc27f1e068f568b9ad1ab7/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8", size = 705026878, upload-time = "2025-06-06T21:52:51.348Z" }, + { url = "https://files.pythonhosted.org/packages/ba/51/e123d997aa098c61d029f76663dedbfb9bc8dcf8c60cbd6adbe42f76d049/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:949452be657fa16687d0930933f032835951ef0892b37d2d53824d1a84dc97a8", size = 706758467, upload-time = "2025-06-06T21:54:08.597Z" }, +] + +[[package]] +name = "nvidia-cufft-cu12" +version = "11.3.3.83" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/bc/7771846d3a0272026c416fbb7e5f4c1f146d6d80704534d0b187dd6f4800/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a", size = 193109211, upload-time = "2025-03-07T01:44:56.873Z" }, + { url = "https://files.pythonhosted.org/packages/1f/13/ee4e00f30e676b66ae65b4f08cb5bcbb8392c03f54f2d5413ea99a5d1c80/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4d2dd21ec0b88cf61b62e6b43564355e5222e4a3fb394cac0db101f2dd0d4f74", 
size = 193118695, upload-time = "2025-03-07T01:45:27.821Z" }, +] + +[[package]] +name = "nvidia-cufile-cu12" +version = "1.13.1.3" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/fe/1bcba1dfbfb8d01be8d93f07bfc502c93fa23afa6fd5ab3fc7c1df71038a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1d069003be650e131b21c932ec3d8969c1715379251f8d23a1860554b1cb24fc", size = 1197834, upload-time = "2025-03-07T01:45:50.723Z" }, + { url = "https://files.pythonhosted.org/packages/1e/f5/5607710447a6fe9fd9b3283956fceeee8a06cda1d2f56ce31371f595db2a/nvidia_cufile_cu12-1.13.1.3-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:4beb6d4cce47c1a0f1013d72e02b0994730359e17801d395bdcbf20cfb3bb00a", size = 1120705, upload-time = "2025-03-07T01:45:41.434Z" }, +] + +[[package]] +name = "nvidia-curand-cu12" +version = "10.3.9.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/45/5e/92aa15eca622a388b80fbf8375d4760738df6285b1e92c43d37390a33a9a/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:dfab99248034673b779bc6decafdc3404a8a6f502462201f2f31f11354204acd", size = 63625754, upload-time = "2025-03-07T01:46:10.735Z" }, + { url = "https://files.pythonhosted.org/packages/fb/aa/6584b56dc84ebe9cf93226a5cde4d99080c8e90ab40f0c27bda7a0f29aa1/nvidia_curand_cu12-10.3.9.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:b32331d4f4df5d6eefa0554c565b626c7216f87a06a4f56fab27c3b68a830ec9", size = 63619976, upload-time = "2025-03-07T01:46:23.323Z" }, +] + +[[package]] +name = "nvidia-cusolver-cu12" +version = "11.7.3.90" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-cublas-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cusparse-cu12", 
marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/32/f7cd6ce8a7690544d084ea21c26e910a97e077c9b7f07bf5de623ee19981/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0", size = 267229841, upload-time = "2025-03-07T01:46:54.356Z" }, + { url = "https://files.pythonhosted.org/packages/85/48/9a13d2975803e8cf2777d5ed57b87a0b6ca2cc795f9a4f59796a910bfb80/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_x86_64.whl", hash = "sha256:4376c11ad263152bd50ea295c05370360776f8c3427b30991df774f9fb26c450", size = 267506905, upload-time = "2025-03-07T01:47:16.273Z" }, +] + +[[package]] +name = "nvidia-cusparse-cu12" +version = "12.5.8.93" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/bc/f7/cd777c4109681367721b00a106f491e0d0d15cfa1fd59672ce580ce42a97/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc", size = 288117129, upload-time = "2025-03-07T01:47:40.407Z" }, + { url = "https://files.pythonhosted.org/packages/c2/f5/e1854cb2f2bcd4280c44736c93550cc300ff4b8c95ebe370d0aa7d2b473d/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ec05d76bbbd8b61b06a80e1eaf8cf4959c3d4ce8e711b65ebd0443bb0ebb13b", size = 288216466, upload-time = 
"2025-03-07T01:48:13.779Z" }, +] + +[[package]] +name = "nvidia-cusparselt-cu12" +version = "0.7.1" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/73/b9/598f6ff36faaece4b3c50d26f50e38661499ff34346f00e057760b35cc9d/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_aarch64.whl", hash = "sha256:8878dce784d0fac90131b6817b607e803c36e629ba34dc5b433471382196b6a5", size = 283835557, upload-time = "2025-02-26T00:16:54.265Z" }, + { url = "https://files.pythonhosted.org/packages/56/79/12978b96bd44274fe38b5dde5cfb660b1d114f70a65ef962bcbbed99b549/nvidia_cusparselt_cu12-0.7.1-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f1bb701d6b930d5a7cea44c19ceb973311500847f81b634d802b7b539dc55623", size = 287193691, upload-time = "2025-02-26T00:15:44.104Z" }, +] + +[[package]] +name = "nvidia-nccl-cu12" +version = "2.27.5" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bb/1c/857979db0ef194ca5e21478a0612bcdbbe59458d7694361882279947b349/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:31432ad4d1fb1004eb0c56203dc9bc2178a1ba69d1d9e02d64a6938ab5e40e7a", size = 322400625, upload-time = "2025-06-26T04:11:04.496Z" }, + { url = "https://files.pythonhosted.org/packages/6e/89/f7a07dc961b60645dbbf42e80f2bc85ade7feb9a491b11a1e973aa00071f/nvidia_nccl_cu12-2.27.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457", size = 322348229, upload-time = "2025-06-26T04:11:28.385Z" }, +] + +[[package]] +name = "nvidia-nvjitlink-cu12" +version = "12.8.93" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f6/74/86a07f1d0f42998ca31312f998bd3b9a7eff7f52378f4f270c8679c77fb9/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = 
"sha256:81ff63371a7ebd6e6451970684f916be2eab07321b73c9d244dc2b4da7f73b88", size = 39254836, upload-time = "2025-03-07T01:49:55.661Z" }, + { url = "https://files.pythonhosted.org/packages/2a/a2/8cee5da30d13430e87bf99bb33455d2724d0a4a9cb5d7926d80ccb96d008/nvidia_nvjitlink_cu12-12.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:adccd7161ace7261e01bb91e44e88da350895c270d23f744f0820c818b7229e7", size = 38386204, upload-time = "2025-03-07T01:49:43.612Z" }, +] + +[[package]] +name = "nvidia-nvshmem-cu12" +version = "3.4.5" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1d/6a/03aa43cc9bd3ad91553a88b5f6fb25ed6a3752ae86ce2180221962bc2aa5/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0b48363fc6964dede448029434c6abed6c5e37f823cb43c3bcde7ecfc0457e15", size = 138936938, upload-time = "2025-09-06T00:32:05.589Z" }, + { url = "https://files.pythonhosted.org/packages/b5/09/6ea3ea725f82e1e76684f0708bbedd871fc96da89945adeba65c3835a64c/nvidia_nvshmem_cu12-3.4.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:042f2500f24c021db8a06c5eec2539027d57460e1c1a762055a6554f72c369bd", size = 139103095, upload-time = "2025-09-06T00:32:31.266Z" }, +] + +[[package]] +name = "nvidia-nvtx-cu12" +version = "12.8.90" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/10/c0/1b303feea90d296f6176f32a2a70b5ef230f9bdeb3a72bddb0dc922dc137/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d7ad891da111ebafbf7e015d34879f7112832fc239ff0d7d776b6cb685274615", size = 91161, upload-time = "2025-03-07T01:42:23.922Z" }, + { url = "https://files.pythonhosted.org/packages/a2/eb/86626c1bbc2edb86323022371c39aa48df6fd8b0a1647bc274577f72e90b/nvidia_nvtx_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:5b17e2001cc0d751a5bc2c6ec6d26ad95913324a4adb86788c944f8ce9ba441f", size = 89954, upload-time = "2025-03-07T01:42:44.131Z" }, +] + +[[package]] +name = "packaging" +version = "26.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" }, +] + +[[package]] +name = "setuptools" +version = "82.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4f/db/cfac1baf10650ab4d1c111714410d2fbb77ac5a616db26775db562c8fab2/setuptools-82.0.1.tar.gz", hash = "sha256:7d872682c5d01cfde07da7bccc7b65469d3dca203318515ada1de5eda35efbf9", size = 1152316, upload-time = "2026-03-09T12:47:17.221Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9d/76/f789f7a86709c6b087c5a2f52f911838cad707cc613162401badc665acfe/setuptools-82.0.1-py3-none-any.whl", hash = "sha256:a59e362652f08dcd477c78bb6e7bd9d80a7995bc73ce773050228a348ce2e5bb", size = 1006223, upload-time = "2026-03-09T12:47:15.026Z" }, +] + +[[package]] +name = "sympy" +version = "1.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = 
"sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, +] + +[[package]] +name = "torch" +version = "2.10.0+cu128" +source = { registry = "https://download.pytorch.org/whl/cu128" } +dependencies = [ + { name = "cuda-bindings", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "filelock", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "fsspec", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "jinja2", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "networkx", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cublas-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cuda-cupti-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cuda-nvrtc-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cuda-runtime-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 
'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cudnn-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cufft-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cufile-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-curand-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cusolver-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cusparse-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-cusparselt-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-nccl-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-nvshmem-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "nvidia-nvtx-cu12", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "setuptools", marker = "(platform_machine == 
'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "sympy", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "triton", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "typing-extensions", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +wheels = [ + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6f09cdf2415516be028ae82e6b985bcfc3eac37bc52ab401142689f6224516ca" }, + { url = "https://download-r2.pytorch.org/whl/cu128/torch-2.10.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:628e89bd5110ced7debee2a57c69959725b7fbc64eab81a39dd70e46c7e28ba5" }, +] + +[[package]] +name = "triton" +version = "3.6.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/17/5d/08201db32823bdf77a0e2b9039540080b2e5c23a20706ddba942924ebcd6/triton-3.6.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:374f52c11a711fd062b4bfbb201fd9ac0a5febd28a96fb41b4a0f51dde3157f4", size = 176128243, upload-time = "2026-01-20T16:16:07.857Z" }, + { url = "https://files.pythonhosted.org/packages/ab/a8/cdf8b3e4c98132f965f88c2313a4b493266832ad47fb52f23d14d4f86bb5/triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74caf5e34b66d9f3a429af689c1c7128daba1d8208df60e81106b115c00d6fca", size = 188266850, upload-time = "2026-01-20T16:00:43.041Z" }, +] + +[[package]] +name = "typing-extensions" +version = "4.15.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, +] + +[[package]] +name = "wheel" +version = "0.46.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "packaging", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/89/24/a2eb353a6edac9a0303977c4cb048134959dd2a51b48a269dfc9dde00c8a/wheel-0.46.3.tar.gz", hash = "sha256:e3e79874b07d776c40bd6033f8ddf76a7dad46a7b8aa1b2787a83083519a1803", size = 60605, upload-time = "2026-01-22T12:39:49.136Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/87/22/b76d483683216dde3d67cba61fb2444be8d5be289bf628c13fc0fd90e5f9/wheel-0.46.3-py3-none-any.whl", hash = "sha256:4b399d56c9d9338230118d705d9737a2a468ccca63d5e813e2a4fc7815d8bc4d", size = 30557, upload-time = "2026-01-22T12:39:48.099Z" }, +] diff --git a/services/automodel/docker/no_override_requirements.txt b/services/automodel/docker/no_override_requirements.txt new file mode 100644 index 00000000..03482e62 --- /dev/null +++ b/services/automodel/docker/no_override_requirements.txt @@ -0,0 +1,10 @@ +# Preserve ML stack from nmp-automodel-base (Automodel uv sync on NGC PyTorch). +# Same pattern as customizer NO_OVERRIDE_REQUIREMENTS_PATH: impossible marker blocks +# install/upgrade so the base venv pins remain intact when adding platform glue. 
# Packaging metadata for the nmp-automodel distribution (job compiler and
# platform task entry points).

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "nmp-automodel"
version = "0.1.0"
description = "NeMo Automodel job compiler and platform tasks (no HTTP server)."
readme = "README.md"
requires-python = ">=3.11,<3.14"
dependencies = [
    "aiofiles>=24.1.0",
    "httpx>=0.27.0",
    "jsonschema>=4.23.0",
    "nemo-platform-sdk",
    "nmp-common",
    "pydantic>=2.10.6",
    "pydantic-settings>=2.6.1",
    "tenacity>=8.5.0",
]

[project.optional-dependencies]
dev = [
    "pytest>=8.3.4",
    "pytest-asyncio>=0.25.3",
    "pytest-mock>=3.14.0",
]

# Console entry points; each maps to a `run` callable in a task module.
[project.scripts]
nmp-automodel-file-io = "nmp.automodel.tasks.file_io:run"
nmp-automodel-model-entity = "nmp.automodel.tasks.model_entity.__main__:run"
nmp-automodel-training = "nmp.automodel.tasks.training.__main__:run"

[tool.hatch.build.targets.wheel]
packages = ["src/nmp"]

# The two unpinned dependencies above are taken from the uv workspace.
[tool.uv.sources]
nemo-platform-sdk = { workspace = true }
nmp-common = { workspace = true }

[tool.pytest.ini_options]
asyncio_mode = "auto"
pythonpath = ["src"]
testpaths = ["tests"]
def _value_or(section: dict[str, Any], key: str, default: Any) -> Any:
    """Return ``section[key]`` unless it is missing or ``None``; then return ``default``.

    ``dict.get(key, default)`` only falls back when the key is absent. A spec obtained
    through ``model_dump()`` may carry the key with an explicit ``None``, which would
    otherwise be forwarded to a non-optional compiler field. Falsy but valid values
    (``0``, ``0.0``, ``False``) are returned unchanged.
    """
    value = section.get(key)
    return default if value is None else value


def _map_finetuning_type(value: str) -> str:
    """Normalise a finetuning type: anything other than the two known full-weight/merged values is ``"lora"``."""
    if value == "all_weights":
        return "all_weights"
    if value == "lora_merged":
        return "lora_merged"
    return "lora"


def _build_peft(training: dict[str, Any]) -> LoRAParams | None:
    """Build the LoRA configuration for the ``training`` section of a spec.

    Returns:
        ``None`` when ``finetuning_type`` is ``"all_weights"``; otherwise a ``LoRAParams``.
        A missing or ``None`` ``finetuning_type`` is treated as ``"lora"``.
    """
    ft = _value_or(training, "finetuning_type", "lora")
    if ft == "all_weights":
        return None
    lora = training.get("lora") or {}
    return LoRAParams(
        rank=_value_or(lora, "rank", 16),
        alpha=_value_or(lora, "alpha", 32),
        # Forward dropout when the spec provides it instead of silently dropping it;
        # 0.0 matches the LoRAParams field default.
        dropout=_value_or(lora, "dropout", 0.0),
        # "lora_merged" forces a merge regardless of the nested flag.
        merge=ft == "lora_merged" or _value_or(lora, "merge", False),
        target_modules=lora.get("target_modules"),
    )


def _build_training_block(spec: dict[str, Any]) -> SFTTraining | DistillationTraining:
    """Flatten the grouped spec sections into a compiler training block.

    Reads ``training`` (required) plus the optional ``schedule``, ``batch``, ``optimizer``
    and ``parallelism`` sections. Missing sections, missing keys and explicit ``None``
    values all fall back to the defaults below; fields that are optional on the compiler
    side are passed through as-is (``None`` allowed).

    Returns:
        ``DistillationTraining`` when ``training_type`` is ``"distillation"``; ``SFTTraining``
        for every other value (including missing).

    Raises:
        KeyError: if ``training`` is absent, or ``teacher_model`` is absent for distillation.
    """
    training = spec["training"]
    schedule = spec.get("schedule") or {}
    batch = spec.get("batch") or {}
    optimizer = spec.get("optimizer") or {}
    parallelism = spec.get("parallelism") or {}

    common: dict[str, Any] = {
        "peft": _build_peft(training),
        "learning_rate": _value_or(optimizer, "learning_rate", 1e-4),
        "weight_decay": _value_or(optimizer, "weight_decay", 0.01),
        "warmup_steps": _value_or(optimizer, "warmup_steps", 0),
        "epochs": _value_or(schedule, "epochs", 1),
        "max_steps": schedule.get("max_steps"),
        "val_check_interval": schedule.get("val_check_interval"),
        # The spec calls it global_batch_size; the compiler field is batch_size.
        "batch_size": _value_or(batch, "global_batch_size", 8),
        "micro_batch_size": _value_or(batch, "micro_batch_size", 1),
        "sequence_packing": _value_or(batch, "sequence_packing", False),
        "max_seq_length": _value_or(training, "max_seq_length", 2048),
        "seed": schedule.get("seed"),
        "parallelism": ParallelismParams(
            num_nodes=_value_or(parallelism, "num_nodes", 1),
            num_gpus_per_node=_value_or(parallelism, "num_gpus_per_node", 1),
            tensor_parallel_size=_value_or(parallelism, "tensor_parallel_size", 1),
            pipeline_parallel_size=_value_or(parallelism, "pipeline_parallel_size", 1),
            context_parallel_size=_value_or(parallelism, "context_parallel_size", 1),
            expert_parallel_size=parallelism.get("expert_parallel_size"),
        ),
        "execution_profile": training.get("execution_profile"),
    }

    training_type: Literal["sft", "distillation"] = _value_or(training, "training_type", "sft")
    if training_type == "distillation":
        return DistillationTraining(
            **common,
            teacher_model=training["teacher_model"],
            teacher_precision=_value_or(training, "teacher_precision", "bf16"),
            distillation_ratio=_value_or(training, "distillation_ratio", 0.5),
            distillation_temperature=_value_or(training, "distillation_temperature", 1.0),
        )
    return SFTTraining(**common)


def _build_integrations(spec: dict[str, Any]) -> IntegrationParams | None:
    """Build integration settings from ``spec["integrations"]``.

    Returns:
        ``None`` when the section is missing or empty; otherwise an ``IntegrationParams``
        whose ``wandb`` entry is ``None`` unless a non-empty ``wandb`` mapping is present.
    """
    raw = spec.get("integrations")
    if not raw:
        return None
    wandb = raw.get("wandb")
    wandb_params = None
    if wandb:
        secret = wandb.get("api_key_secret")
        wandb_params = WandBParams(
            project=wandb.get("project"),
            # Pass the remaining W&B settings through so they are not silently discarded;
            # absent keys yield None, which equals the WandBParams field defaults.
            name=wandb.get("name"),
            entity=wandb.get("entity"),
            tags=wandb.get("tags"),
            notes=wandb.get("notes"),
            base_url=wandb.get("base_url"),
            # A bare string is wrapped; any other value is forwarded untouched.
            api_key_secret=SecretRef(secret) if isinstance(secret, str) else secret,
        )
    return IntegrationParams(wandb=wandb_params, mlflow=raw.get("mlflow"))


def automodel_spec_to_compiler_output(spec: dict[str, Any] | Any) -> CustomizationJobOutput:
    """Map simplified Automodel job output (plugin schema) to ``CustomizationJobOutput``.

    Args:
        spec: Either a mapping or an object exposing ``model_dump`` (dumped with
            ``mode="python"``).

    Returns:
        The compiler-facing job description. ``deployment_config`` is always ``None``.

    Raises:
        KeyError: if ``dataset``, ``output``, ``model`` or ``training`` is missing, or a
            dict-shaped ``output`` lacks ``name``/``fileset``.
    """
    if hasattr(spec, "model_dump"):
        data = spec.model_dump(mode="python")
    else:
        data = dict(spec)

    # Only the training split is forwarded when the dataset is given as a mapping.
    dataset = data["dataset"]
    training_uri = dataset["training"] if isinstance(dataset, dict) else dataset

    output = data["output"]
    if isinstance(output, dict):
        # NOTE(review): the fallback is "model" even for an unmerged LoRA job, where
        # "adapter" looks like the expected type -- confirm callers always set `type`.
        out_type = _value_or(output, "type", "model")
        output_resp = OutputResponse(
            name=output["name"],
            type=out_type,
            fileset=output["fileset"],
            description=output.get("description"),
        )
    else:
        # Already a resolved output object; pass it through unchanged.
        output_resp = output

    return CustomizationJobOutput(
        name=data.get("name"),
        model=data["model"],
        dataset=training_uri,
        training=_build_training_block(data),
        integrations=_build_integrations(data),
        deployment_config=None,
        output=output_resp,
    )
class LoRAParams(_PEFTParams):
    """LoRA adapter configuration."""

    # Discriminator tag; fixed for this PEFT method.
    type: Literal["lora"] = "lora"

    rank: int = Field(
        description="LoRA rank (low-rank dimension). Higher values increase capacity but use more memory.",
        default=8,
        ge=1,
        le=256,
    )
    alpha: int = Field(
        description="LoRA alpha scaling factor. Common practice: alpha = 2-4x rank.",
        default=32,
        ge=1,
    )
    dropout: float = Field(
        description="LoRA dropout probability for regularization.",
        default=0.0,
        ge=0.0,
        le=1.0,
    )
    target_modules: Optional[list[str]] = Field(
        description=(
            "Module name patterns to apply LoRA to (e.g., ['*.q_proj', '*.v_proj']). "
            "If not set, applies to all '*proj' linear layers."
        ),
        default=None,
    )
    merge: bool = Field(
        description=(
            "Merge LoRA weights into base model after training. "
            "Produces a full-weight checkpoint instead of an adapter."
        ),
        default=False,
    )
    use_dora: bool = Field(
        description=(
            "Enable DoRA (Weight-Decomposed Low-Rank Adaptation). "
            "Decomposes weight updates into magnitude and direction components. "
            "Can improve quality especially at low ranks, but adds training overhead."
        ),
        default=False,
    )

    @model_validator(mode="after")
    def _validate_unsupported_features(self) -> Self:
        """Reject options that the schema exposes but training does not support yet."""
        # Checked in order, so quantization is reported first when both are set.
        unsupported = (
            (self.quantization is not None, "Quantized LoRA training is not yet supported."),
            (self.use_dora, "DoRA is not yet supported."),
        )
        for requested, message in unsupported:
            if requested:
                raise ValueError(message)
        return self
class _TrainingBase(BaseModel):
    """Common training configuration shared by all methods.

    Flat hyperparameters match the ML practitioner mental model
    (like HuggingFace TrainingArguments / TRL SFTConfig).
    Only parallelism is grouped — it's enterprise infrastructure.
    """

    # Empty protected_namespaces, same setting as the job schemas in this module.
    model_config = ConfigDict(protected_namespaces=())

    # --- PEFT (orthogonal to training method) ---
    peft: Optional[PeftMethod] = Field(
        description=(
            "PEFT adapter configuration. "
            "If set, trains a parameter-efficient adapter. "
            "If omitted, performs full-weight fine-tuning."
        ),
        default=None,
    )

    # --- Optimizer ---
    learning_rate: float = Field(
        description=(
            "Peak learning rate. "
            "Optimal value will depend on training type and PEFT. "
            "For SFT without LoRA, start with 5e-5. "
            "If using LoRA start with 1e-4. "
            "Lowering the value can enable for slower, more precise training; "
            "Raising the value speeds up learning."
        ),
        default=1e-4,
    )
    min_learning_rate: Optional[float] = Field(
        description="Minimum learning rate for cosine decay. Optional; used with learning rate schedules.",
        default=None,
    )
    weight_decay: float = Field(
        description="Weight decay coefficient. Helps prevent overfitting.",
        default=0.01,
    )
    adam_beta1: float = Field(
        description="Adam beta1 parameter. Adjust for optimizer tuning.",
        default=0.9,
    )
    adam_beta2: float = Field(
        description="Adam beta2 parameter. Adjust for optimizer tuning.",
        default=0.999,
    )
    warmup_steps: int = Field(
        description="Linear warmup steps. Recommended: 10% of total training steps for stable training.",
        default=0,
        ge=0,
    )
    optimizer: Optional[str] = Field(description="Optimizer name (e.g., 'adamw').", default=None)

    # --- Schedule ---
    epochs: int = Field(
        description=(
            "Number of complete passes through the dataset. "
            "The ideal number of epochs depends on the training method, the number of training samples, "
            "and size of the model. "
            "Start with 3 for a reasonable value. "
            "Monitor the validation and training loss curves. "
            "If both are still decreasing, you can increase this number."
        ),
        default=1,
        gt=0,
    )
    max_steps: Optional[int] = Field(
        description="Max training steps. Overrides epochs if set.",
        default=None,
        gt=0,
    )
    log_every_n_steps: Optional[int] = Field(
        description="Logging frequency in steps. Controls how often training metrics are logged.",
        default=None,
    )
    val_check_interval: Optional[float] = Field(
        description="Validation interval. Float <= 1.0 is fraction of epoch; > 1.0 is step count.",
        default=None,
    )

    # --- Batch ---
    batch_size: int = Field(
        description="Global batch size across all GPUs. Higher = faster but more memory. If OOM, reduce this first.",
        default=32,
        gt=0,
    )
    micro_batch_size: int = Field(
        description="Per-GPU micro batch size. Keep small (1-2) for large models to avoid OOM.",
        default=1,
        gt=0,
    )
    sequence_packing: bool = Field(
        description="Enable sequence packing for efficiency. Can improve training speed.",
        default=False,
    )

    # --- Model ---
    max_seq_length: int = Field(
        description="Maximum token sequence length for training. Higher = more memory, longer training.",
        default=2048,
        gt=0,
    )
    precision: Optional[Precision] = Field(
        description="Model precision for training. Auto-detected if unset.",
        default=None,
    )
    seed: Optional[int] = Field(
        description="Random seed for reproducibility. Optional.",
        default=None,
    )

    # --- Enterprise Infrastructure ---
    parallelism: ParallelismParams = Field(default_factory=ParallelismParams)
    execution_profile: Optional[str] = Field(
        description=(
            "Execution profile for the GPU training step. "
            "Maps to an operator-configured profile (e.g., 'a100', 'high_priority'). "
            "If omitted, uses the service-level default."
        ),
        default=None,
        min_length=1,
    )

    @property
    def finetuning_type(self) -> FinetuningType:
        """Finetuning type implied by ``peft``.

        No ``peft`` means full-weight training; a LoRA config maps to the merged or
        plain LoRA type depending on its ``merge`` flag.

        Raises:
            ValueError: if ``peft`` is set to something other than ``LoRAParams``.
        """
        adapter = self.peft
        if adapter is None:
            return FinetuningType.ALL_WEIGHTS
        if not isinstance(adapter, LoRAParams):
            raise ValueError(f"Unknown PEFT type: {type(adapter).__name__}")
        if adapter.merge:
            return FinetuningType.LORA_MERGED
        return FinetuningType.LORA
class DPOTraining(_TrainingBase):
    """Direct Preference Optimization."""

    # Discriminator tag selecting this variant of the training union.
    type: Literal["dpo"] = "dpo"

    ref_policy_kl_penalty: float = Field(
        description="KL penalty coefficient (beta in DPO paper).",
        default=0.05,
        ge=0.0,
    )
    preference_average_log_probs: bool = Field(
        description="Average log probabilities for preference loss calculation.",
        default=False,
    )
    sft_average_log_probs: bool = Field(
        description="Average log probabilities for SFT regularization loss.",
        default=False,
    )
    preference_loss_weight: float = Field(
        description="Weight for the preference (DPO) loss term.",
        default=1.0,
        ge=0.0,
    )
    sft_loss_weight: float = Field(
        description="Weight for SFT regularization loss (0 = disabled).",
        default=0.0,
        ge=0.0,
    )
    max_grad_norm: float = Field(
        description="Maximum gradient norm for clipping.",
        default=1.0,
        ge=0.0,
    )

    @model_validator(mode="after")
    def _peft_not_yet_supported(self) -> Self:
        """Allow only full-weight DPO: any ``peft`` configuration is rejected."""
        if self.peft is None:
            return self
        raise ValueError(
            "PEFT is not yet supported with DPO training. Use full-weight training by omitting the 'peft' field."
        )
class MLflowParams(BaseModel):
    """MLflow integration configuration."""

    # Every field is optional; the descriptions state the fallback where one exists.
    experiment_name: Optional[str] = Field(
        description="MLflow experiment name (groups related runs). Defaults to output.name if not set.",
        default=None,
    )
    run_name: Optional[str] = Field(
        description="MLflow run name. Defaults to job_id if not provided.",
        default=None,
    )
    tags: Optional[dict[str, str]] = Field(
        description="MLflow tags as key-value pairs for filtering runs.",
        default=None,
    )
    description: Optional[str] = Field(
        description="MLflow run description.",
        default=None,
    )
    tracking_uri: Optional[str] = Field(
        description=(
            "MLflow tracking server URI (e.g., 'http://mlflow.mycompany.com:5000'). "
            "Can also be set via MLFLOW_TRACKING_URI environment variable."
        ),
        default=None,
    )
class DeploymentParams(BaseModel):
    """Inline deployment parameters for creating a new ModelDeploymentConfig."""

    # A negative GPU count can never be satisfied, so reject it at validation time,
    # in line with the bounded GPU counts on ParallelismParams. Zero stays valid because
    # nothing visible here rules out a deployment that requests no GPU.
    gpu: int = Field(
        default=1,
        ge=0,
        description="Number of GPUs required for the deployment",
    )

    additional_envs: Optional[dict[str, str]] = Field(
        default=None,
        description="Additional environment variables for the deployment",
    )

    disk_size: Optional[str] = Field(
        default=None,
        description="Disk size for the deployment",
    )

    image_name: Optional[str] = Field(
        default=None,
        description="Container image name from NGC. If not specified, defaults to multi-llm",
    )

    image_tag: Optional[str] = Field(
        default=None,
        description="Container image tag from NGC",
    )

    # Defaults to True; CustomizationJobInput rejects False when an unmerged LoRA
    # adapter is being trained.
    lora_enabled: bool = Field(
        default=True,
        description="When automatically deploying a full SFT training, this parameter being set to true will allow subsequent LoRA adapters to be trained and deployed against it.",
    )

    tool_call_config: Optional[ToolCallParams] = Field(
        default=None,
        description="Tool calling configuration override for the NIM deployment.",
    )
Either `model` (full fine-tuned weights) or `adapter` (LoRA adapter weights).", + examples=["model", "adapter"], + ) + fileset: str = Field( + pattern=REGEX_WORD_CHARACTER_DOT_DASH, + max_length=MAX_LENGTH_255, + description="FileSet name where output artifacts are stored.", + examples=["my-model-a1b2c3d4e5f6"], + ) + + +# ============================================================ +# Job Schemas +# ============================================================ + + +class _CustomizationJobBase(BaseModel): + """Base schema with common fields for customization jobs.""" + + model: str = Field(description="Model reference (e.g., 'workspace/model-name').") + dataset: Annotated[str, AfterValidator(validate_fileset_uri)] = Field( + description="Training dataset fileset as 'workspace/name' or 'name' (resolved in the job path workspace)." + ) + training: TrainingMethod = Field(description="Training method and hyperparameters.") + integrations: Optional[IntegrationParams] = Field( + default=None, + description="Third-party integrations (e.g., Weights & Biases, MLflow).", + ) + deployment_config: Optional[str | DeploymentParams] = Field( + default=None, + description="Deployment configuration for auto-deploying the model after training. " + "Pass a string to reference an existing ModelDeploymentConfig by name " + "(e.g., 'my-config' or 'workspace/my-config'). " + "An object provides inline NIM deployment parameters. " + "Omit to skip deployment.", + ) + custom_fields: Dict[str, Any] = Field(default_factory=dict, description="Custom user-defined fields.") + + model_config = ConfigDict(protected_namespaces=(), regex_engine="python-re") + + +class CustomizationJobInput(_CustomizationJobBase): + """Input schema for creating customization jobs.""" + + output: Optional[OutputRequest] = Field( + default=None, + description="Output artifact configuration. If omitted, name is auto-generated as " + "`{model}-{dataset}-`. 
class CustomizationJobOutput(_CustomizationJobBase):
    """Customization job details returned by the server.

    Extends the shared job fields with the resolved output artifact and provides
    a pre-flight check of the parallelism and batch-size settings.
    """

    output: OutputResponse = Field(
        description="Output artifact created by this job.",
        examples=[
            {"name": "my-finetuned-llama", "type": "model", "fileset": "my-finetuned-llama"},
            {"name": "llama-3-8b-lora-v2", "type": "adapter", "fileset": "llama-3-8b-lora-v2-a1b2c3d4e5f6"},
        ],
    )

    def validate_for_training(self) -> None:
        """Validate the parallelism and batch-size settings for training execution.

        Call this after any enrichment has been applied.

        Checks, in order:
          1. total GPUs is divisible by tensor * pipeline * context parallel size;
          2. when ``expert_parallel_size`` is set, (derived data-parallel size *
             context parallel size) is divisible by it, and tensor parallelism
             is not combined with expert parallelism > 1;
          3. ``batch_size`` is divisible by ``micro_batch_size`` * derived
             data-parallel size.

        Raises:
            ValidationError: If validation fails.
        """
        training = self.training
        p = training.parallelism
        num_nodes = p.num_nodes
        num_gpus_per_node = p.num_gpus_per_node
        tp = p.tensor_parallel_size
        pp = p.pipeline_parallel_size
        cp = p.context_parallel_size
        ep = p.expert_parallel_size

        total_gpus = num_gpus_per_node * num_nodes
        model_parallel_size = tp * pp * cp
        if total_gpus % model_parallel_size != 0:
            raise ValidationError(
                f"Total GPUs ({total_gpus}) must be divisible by "
                f"tensor_parallel_size ({tp}) * "
                f"pipeline_parallel_size ({pp}) * "
                f"context_parallel_size ({cp}) = {model_parallel_size}"
            )

        # Whatever is left after model parallelism is the data-parallel degree.
        derived_dp = total_gpus // model_parallel_size

        # Note: Expert model parallelism (EP) is NOT a dimension that divides world_size like TP/PP.
        # Instead, EP operates orthogonally, therefore we validate it separately.
        # It distributes experts across the dp × cp dimension.
        # FSDP2 requires: (dp_size × cp_size) % ep_size == 0
        if ep is not None:
            dp_cp = derived_dp * cp
            if dp_cp % ep != 0:
                raise ValidationError(
                    f"(data_parallel_size * context_parallel_size) ({derived_dp} * {cp} = {dp_cp}) "
                    f"must be divisible by expert_parallel_size ({ep})"
                )
            # MoE models on multi-GPU don't support tensor parallelism
            # in Automodel's MoE parallelizer. See: nemo_automodel/components/moe/parallelizer.py
            if ep > 1 and tp > 1 and total_gpus > 1:
                raise ValidationError(
                    f"Tensor parallelism (tensor_parallel_size={tp}) is not supported for MoE models. "
                    f"When expert_parallel_size > 1 ({ep}), tensor_parallel_size must be 1."
                )

        gb = training.batch_size
        mb = training.micro_batch_size
        divisor = mb * derived_dp
        if gb % divisor != 0:
            # Nearest valid batch sizes below and above the requested one. When the
            # request is smaller than one full step there is no valid lower value,
            # so only the upper one is suggested (instead of printing it twice).
            lower = divisor * (gb // divisor)
            upper = lower + divisor
            suggestion = f"{lower} or {upper}" if lower > 0 else f"{upper}"
            raise ValidationError(
                f"batch_size ({gb}) must be divisible by "
                f"micro_batch_size ({mb}) * data_parallel_size ({derived_dp}) = {divisor}. "
                f"Consider adjusting batch_size to {suggestion}."
            )
"NMP_FILES_URL" + +# Models whose checkpoints require transformers-v4-compatible config.json output. +# When v4_compatible is enabled, the original pretrained config.json is preserved +# alongside a config.v5.json so downstream consumers (e.g. vLLM) that expect +# a v4-format config continue to work. +# using frozenset for faster lookup +V4_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: frozenset[str] = frozenset( + { + "ApertusForCausalLM", + "ArceeForCausalLM", + "AriaTextForCausalLM", + "BambaForCausalLM", + "BartForCausalLM", + "BertLMHeadModel", + "BertGenerationDecoder", + "BigBirdForCausalLM", + "BigBirdPegasusForCausalLM", + "BioGptForCausalLM", + "BitNetForCausalLM", + "BlenderbotForCausalLM", + "BlenderbotSmallForCausalLM", + "BloomForCausalLM", + "BltForCausalLM", + "CamembertForCausalLM", + "LlamaForCausalLM", + "CodeGenForCausalLM", + "CohereForCausalLM", + "Cohere2ForCausalLM", + "CpmAntForCausalLM", + "CTRLLMHeadModel", + "Data2VecTextForCausalLM", + "DbrxForCausalLM", + "DeepseekV2ForCausalLM", + "DeepseekV3ForCausalLM", + "DiffLlamaForCausalLM", + "DogeForCausalLM", + "Dots1ForCausalLM", + "ElectraForCausalLM", + "Emu3ForCausalLM", + "ErnieForCausalLM", + "Ernie4_5ForCausalLM", + "Ernie4_5_MoeForCausalLM", + "Exaone4ForCausalLM", + "FalconForCausalLM", + "FalconH1ForCausalLM", + "FalconMambaForCausalLM", + "FlexOlmoForCausalLM", + "FuyuForCausalLM", + "GemmaForCausalLM", + "Gemma2ForCausalLM", + "Gemma3ForConditionalGeneration", + "Gemma3ForCausalLM", + "Gemma3nForConditionalGeneration", + "Gemma3nForCausalLM", + "GitForCausalLM", + "GlmForCausalLM", + "Glm4ForCausalLM", + "Glm4MoeForCausalLM", + "GotOcr2ForConditionalGeneration", + "GPT2LMHeadModel", + "GPTBigCodeForCausalLM", + "GPTNeoForCausalLM", + "GPTNeoXForCausalLM", + "GPTNeoXJapaneseForCausalLM", + "GptOssForCausalLM", + "GPTJForCausalLM", + "GraniteForCausalLM", + "GraniteMoeForCausalLM", + "GraniteMoeHybridForCausalLM", + "GraniteMoeSharedForCausalLM", + "HeliumForCausalLM", + 
"HunYuanDenseV1ForCausalLM", + "HunYuanMoEV1ForCausalLM", + "JambaForCausalLM", + "JetMoeForCausalLM", + "Lfm2ForCausalLM", + "Llama4ForCausalLM", + "LongcatFlashForCausalLM", + "MambaForCausalLM", + "Mamba2ForCausalLM", + "MarianForCausalLM", + "MBartForCausalLM", + "MegaForCausalLM", + "MegatronBertForCausalLM", + "MiniMaxForCausalLM", + "MinistralForCausalLM", + "MistralForCausalLM", + "MixtralForCausalLM", + "MllamaForCausalLM", + "ModernBertDecoderForCausalLM", + "MoshiForCausalLM", + "MptForCausalLM", + "MusicgenForCausalLM", + "MusicgenMelodyForCausalLM", + "MvpForCausalLM", + "NemotronForCausalLM", + "OlmoForCausalLM", + "Olmo2ForCausalLM", + "Olmo3ForCausalLM", + "OlmoeForCausalLM", + "OpenLlamaForCausalLM", + "OpenAIGPTLMHeadModel", + "OPTForCausalLM", + "PegasusForCausalLM", + "PersimmonForCausalLM", + "PhiForCausalLM", + "Phi3ForCausalLM", + "Phi4MultimodalForCausalLM", + "PhimoeForCausalLM", + "PLBartForCausalLM", + "ProphetNetForCausalLM", + "QDQBertLMHeadModel", + "Qwen2ForCausalLM", + "Qwen2MoeForCausalLM", + "Qwen3ForCausalLM", + "Qwen3MoeForCausalLM", + "Qwen3NextForCausalLM", + "RecurrentGemmaForCausalLM", + "ReformerModelWithLMHead", + "RemBertForCausalLM", + "RobertaForCausalLM", + "RobertaPreLayerNormForCausalLM", + "RoCBertForCausalLM", + "RoFormerForCausalLM", + "RwkvForCausalLM", + "SeedOssForCausalLM", + "SmolLM3ForCausalLM", + "Speech2Text2ForCausalLM", + "StableLmForCausalLM", + "Starcoder2ForCausalLM", + "TransfoXLLMHeadModel", + "TrOCRForCausalLM", + "VaultGemmaForCausalLM", + "WhisperForCausalLM", + "XGLMForCausalLM", + "XLMWithLMHeadModel", + "XLMProphetNetForCausalLM", + "XLMRobertaForCausalLM", + "XLMRobertaXLForCausalLM", + "XLNetLMHeadModel", + "xLSTMForCausalLM", + "XmodForCausalLM", + "ZambaForCausalLM", + "Zamba2ForCausalLM", + } +) diff --git a/services/automodel/src/nmp/automodel/app/jobs/__init__.py b/services/automodel/src/nmp/automodel/app/jobs/__init__.py new file mode 100644 index 00000000..e5725ea5 --- /dev/null +++ 
b/services/automodel/src/nmp/automodel/app/jobs/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/services/automodel/src/nmp/automodel/app/jobs/compiler.py b/services/automodel/src/nmp/automodel/app/jobs/compiler.py new file mode 100644 index 00000000..f43b1190 --- /dev/null +++ b/services/automodel/src/nmp/automodel/app/jobs/compiler.py @@ -0,0 +1,501 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Job compiler - transforms CustomizationJobOutput into PlatformJobSpec.""" + +import logging + +from nemo_platform import AsyncNeMoPlatform, NotFoundError +from nemo_platform.types.models.model_entity import ModelEntity +from nmp.automodel.api.v2.jobs.schemas import ( + CustomizationJobOutput, + DeploymentParams, + DistillationTraining, + LoRAParams, + ValidationError, +) +from nmp.automodel.app.constants import ( + DEFAULT_DATASET_PATH, + DEFAULT_MODEL_PATH, + DEFAULT_OUTPUT_MODEL_PATH, + DEFAULT_TEACHER_MODEL_PATH, +) +from nmp.automodel.app.jobs.file_io.schemas import ( + DownloadItem, + FileIOTaskConfig, + FileSetRef, + UploadItem, +) +from nmp.automodel.app.jobs.model_entity.schemas import ( + DeploymentParameters as ModelEntityDeploymentParameters, +) +from nmp.automodel.app.jobs.model_entity.schemas import ( + ModelEntityTaskConfig, +) +from nmp.automodel.app.jobs.model_entity.schemas import ( + PEFTConfig as ModelEntityPEFTConfig, +) +from nmp.automodel.app.jobs.training.compiler import ( + _extract_model_name, + _resolve_is_embedding_model, + compile_training_step, +) +from nmp.automodel.config import config +from nmp.automodel.entities.values import FinetuningType +from nmp.automodel.images import AUTOMODEL_PYTHON_ENTRYPOINT, get_tasks_image +from nmp.automodel.platform_client import fetch_model_entity +from nmp.common.auth 
import AuthClient, auth_client_context +from nmp.common.entities.utils import parse_entity_ref +from nmp.common.jobs.api_factory import ( + ContainerSpec, + CPUExecutionProviderSpec, + EnvironmentVariable, + PlatformJobSpec, + PlatformJobStep, + ResourcesLimitsSpec, + ResourcesRequestsSpec, + ResourcesSpec, +) +from nmp.common.jobs.constants import DEFAULT_JOB_STORAGE_PATH, PERSISTENT_JOB_STORAGE_PATH_ENVVAR +from nmp.common.jobs.exceptions import PlatformJobCompilationError + +logger = logging.getLogger(__name__) + + +def _get_cpu_resources() -> ResourcesSpec: + """Get default CPU resources for download/upload tasks.""" + return ResourcesSpec( + limits=ResourcesLimitsSpec( + cpu=config.default_job_resource_cpu_limit, + memory=config.default_job_resource_memory_limit, + ), + requests=ResourcesRequestsSpec( + cpu=config.default_job_resource_cpu_request, + memory=config.default_job_resource_memory_request, + ), + ) + + +def _get_base_environment() -> list[EnvironmentVariable]: + """Get base environment variables for all tasks.""" + return [ + EnvironmentVariable( + name=PERSISTENT_JOB_STORAGE_PATH_ENVVAR, + value=DEFAULT_JOB_STORAGE_PATH, + ), + ] + + +def _extract_model_uri(me: ModelEntity) -> str | None: + """Extract model_uri from the model entity. + + Args: + me: The model entity. + + Returns: + The fileset string if available, None otherwise. + """ + return me.fileset if me.fileset else None + + +def _require_fileset_for_download(fileset_name: str | None, entity_label: str) -> str: + """Require a platform fileset reference for checkpoint download.""" + if not fileset_name or not str(fileset_name).strip(): + raise PlatformJobCompilationError( + f"{entity_label} has no fileset. 
" + "Attach a platform FileSet (workspace/name) with model weights before running training.", + ) + return str(fileset_name) + + +def _append_download_if_present( + downloads: list[DownloadItem], + fileset_name: str | None, + dest: str, + field_name: str, +) -> None: + """Append a download item if a FileSet ref is present.""" + if not fileset_name: + return + fileset = FileSetRef.model_validate(fileset_name) + downloads.append(DownloadItem(src=fileset, dest=dest)) + logger.info(f"Detected {field_name} FileSet reference: {fileset}") + + +def _build_file_download_config( + job_spec: CustomizationJobOutput, + me: ModelEntity, + teacher_me: ModelEntity | None = None, +) -> FileIOTaskConfig: + """Build the configuration for the file_io task. + + Extracts FileSet references from model_uri and dataset fields. + Fileset refs use workspace/name or name (optional legacy fileset:// prefix is stripped). + + Args: + job_spec: The customization job output specification. + me: The model entity being trained. + teacher_me: Optional teacher model entity for knowledge distillation jobs. + + Returns: + FileIOTaskConfig with download items for any fileset refs found. 
def _build_output_fileset_metadata(me: ModelEntity) -> dict | None:
    """Collect tool-calling settings from the source model entity's spec.

    Copies ``chat_template`` and the set fields of ``tool_call_config`` so they
    can be attached to the output fileset as metadata.

    Returns:
        ``{"tool_calling": {...}}`` when at least one setting is present,
        otherwise None (also when the entity has no spec at all).
    """
    spec = me.spec
    if spec is None:
        return None

    collected: dict = {}

    template = spec.chat_template
    if template:
        collected["chat_template"] = template

    tool_cfg = spec.tool_call_config
    if tool_cfg:
        # Parser and plugin are copied only when truthy ...
        for key in ("tool_call_parser", "tool_call_plugin"):
            value = getattr(tool_cfg, key)
            if value:
                collected[key] = value
        # ... while auto_tool_choice is copied whenever it is set, including False.
        if tool_cfg.auto_tool_choice is not None:
            collected["auto_tool_choice"] = tool_cfg.auto_tool_choice

    if not collected:
        return None
    return {"tool_calling": collected}


def _build_file_upload_config(
    output_fileset_name: str,
    fileset_metadata: dict | None = None,
) -> FileIOTaskConfig:
    """Build the file_io upload configuration for the trained model.

    Args:
        output_fileset_name: Name of the output FileSet to upload into.
        fileset_metadata: Optional metadata to set on the output fileset (e.g.,
            tool_calling settings taken from the source model entity).

    Returns:
        FileIOTaskConfig with a single upload item from the default output model
        path to the named FileSet.
    """
    # workspace is None because at this layer, we don't know the job's workspace.
    destination = FileSetRef(workspace=None, name=output_fileset_name)
    upload_item = UploadItem(
        src=DEFAULT_OUTPUT_MODEL_PATH,
        dest=destination,
        metadata=fileset_metadata,
    )
    return FileIOTaskConfig(upload=[upload_item])


def _build_model_entity_config(
    workspace: str, job_spec: CustomizationJobOutput, trust_remote_code: bool = False
) -> ModelEntityTaskConfig:
    """Build the configuration for the model_entity task.

    Args:
        workspace: The workspace for this job.
        job_spec: The customization job output specification.
        trust_remote_code: Whether to trust remote code for the checkpoint.

    Returns:
        ModelEntityTaskConfig with model entity creation settings.
    """
    base_model = _extract_model_name(job_spec)

    output = job_spec.output
    assert output is not None, "output must be set by input-to-output transformer"

    # PEFT details are recorded only for LoRA-style training.
    training = job_spec.training
    peft = training.peft
    peft_config: ModelEntityPEFTConfig | None = (
        ModelEntityPEFTConfig(type=training.finetuning_type, alpha=peft.alpha, rank=peft.rank)
        if isinstance(peft, LoRAParams)
        else None
    )

    # Only forward the user-supplied deployment_config from the job spec.
    # tool_call_config from the *source* model entity's spec is propagated
    # separately via fileset metadata, so it is intentionally not merged here.
    requested = job_spec.deployment_config
    deployment_config: str | ModelEntityDeploymentParameters | None
    if requested is None or isinstance(requested, str):
        # A name reference (or nothing) is passed through untouched.
        deployment_config = requested
    else:
        # Inline parameters are re-validated into the task-side schema.
        deployment_config = ModelEntityDeploymentParameters.model_validate(requested.model_dump())

    output_fileset = FileSetRef(workspace=None, name=output.fileset)
    return ModelEntityTaskConfig(
        name=output.name,
        workspace=workspace,
        description="Customized model from job",
        fileset=output_fileset,
        base_model=base_model,
        model_entity=job_spec.model,
        peft=peft_config,
        trust_remote_code=trust_remote_code,
        deployment_config=deployment_config,
    )
async def _validate_deployment_config(
    workspace: str,
    transformed_spec: CustomizationJobOutput,
    sdk: AsyncNeMoPlatform,
    auth_client: AuthClient,
) -> None:
    """Validate deployment_config consistency before training starts.

    Catches contradictory or impossible configurations early so the user
    gets a clear error instead of a silent failure after expensive training.

    Args:
        workspace: Workspace the job runs in.
        transformed_spec: Job spec whose ``deployment_config`` is checked.
        sdk: Platform client used to look up deployment configs and models.
        auth_client: Client used to check the caller's permissions.

    Raises:
        PlatformJobCompilationError: If the caller lacks permission to set a
            tool-call plugin, or a referenced deployment config contradicts the
            training type or targets a different model entity.
    """
    dc = transformed_spec.deployment_config
    if dc is None:
        return

    # Inline deployment params: check permission-gated fields.
    if isinstance(dc, DeploymentParams):
        tcc = dc.tool_call_config
        if tcc and tcc.tool_call_plugin:
            if not await auth_client.has_permissions(workspace, ["models.tool-call-plugin.set"]):
                raise PlatformJobCompilationError(
                    "Insufficient permissions to set tool_call_plugin. "
                    "Requires the models.tool-call-plugin.set permission."
                )
        return

    # String reference to an existing deployment config: validate consistency.
    if not isinstance(dc, str):
        return

    ft_type = transformed_spec.training.finetuning_type
    is_lora = ft_type == FinetuningType.LORA
    produces_new_model = ft_type in (FinetuningType.ALL_WEIGHTS, FinetuningType.LORA_MERGED)
    resolved_config = await _resolve_deployment_config_ref(dc, workspace, sdk)

    # LoRA job referencing a config that has lora_enabled=False
    if is_lora and resolved_config.nim_deployment and resolved_config.nim_deployment.lora_enabled is False:
        raise PlatformJobCompilationError(
            f"deployment_config references '{dc}' which has lora_enabled=false, "
            "but this is a LoRA training job. The deployment would not load LoRA adapters. "
            "Use a deployment config with lora_enabled=true, or provide inline deployment parameters."
        )

    # SFT or lora_merged referencing a string config
    if produces_new_model:
        output_name = transformed_spec.output.name
        try:
            existing_me = await sdk.models.retrieve(name=output_name, workspace=workspace)
        except NotFoundError as e:
            # Output model entity doesn't exist yet, so a string
            # ref is inherently invalid -- it was created for a different model.
            # Chain explicitly so the traceback shows the lookup failure as the
            # cause, matching the other handlers in this module.
            raise PlatformJobCompilationError(
                f"deployment_config cannot be a string reference ('{dc}') for {ft_type.value} training "
                "that creates a new model entity. The referenced config was created for a different model. "
                "Use inline deployment parameters (e.g., DeploymentParams(gpu=1, lora_enabled=True)) instead."
            ) from e

        # Output model entity already exists (retraining to create a new FileSet).
        # Verify the config actually targets this model entity.
        nim = resolved_config.nim_deployment
        config_targets_model = (resolved_config.model_entity_id == f"{existing_me.workspace}/{existing_me.name}") or (
            nim and nim.model_name == existing_me.name and nim.model_namespace == existing_me.workspace
        )
        if not config_targets_model:
            raise PlatformJobCompilationError(
                f"deployment_config references '{dc}' which targets a different model entity "
                f"than the output model '{existing_me.workspace}/{existing_me.name}'. "
                "The deployment config must target the same model entity being retrained, "
                "or use inline deployment parameters instead."
            )


def _build_cpu_task_step(
    name: str,
    task_module: str,
    task_config: dict,
    resources: ResourcesSpec,
    environment: list[EnvironmentVariable],
) -> PlatformJobStep:
    """Build one CPU job step that runs ``python -m <task_module>`` in the tasks image."""
    return PlatformJobStep(
        name=name,
        executor=CPUExecutionProviderSpec(
            provider="cpu",
            container=ContainerSpec(
                image=get_tasks_image(),
                entrypoint=AUTOMODEL_PYTHON_ENTRYPOINT,
                command=["-m", task_module],
            ),
            resources=resources,
        ),
        environment=environment,
        config=task_config,
    )


async def platform_job_config_compiler(
    workspace: str,
    job_spec: CustomizationJobOutput,
    sdk: AsyncNeMoPlatform,
) -> PlatformJobSpec:
    """Compile canonical job spec into a four-step PlatformJobSpec.

    The steps are, in order: download model/dataset files, train, upload the
    customized model, and create the output model entity.

    Args:
        workspace: Workspace the job runs in.
        job_spec: The customization job to compile.
        sdk: Platform client used to fetch model entities and deployment configs.

    Returns:
        PlatformJobSpec containing the four steps.

    Raises:
        PlatformJobCompilationError: If training validation fails, the teacher
            model cannot be fetched, no auth context is available while a
            deployment config is set, the deployment config is inconsistent, or
            unmerged LoRA is requested for an embedding model.
    """
    transformed_spec = job_spec
    logger.info("Compiling Automodel job to PlatformJobSpec: %s", transformed_spec.model_dump_json(indent=2))

    try:
        transformed_spec.validate_for_training()
    except ValidationError as e:
        raise PlatformJobCompilationError(str(e)) from e

    # output is a required field in CustomizationJobOutput
    cpu_resources = _get_cpu_resources()
    base_env = _get_base_environment()

    # Fetch the primary model entity
    # NOTE(review): unlike the teacher fetch below, errors here are not wrapped in
    # PlatformJobCompilationError -- confirm that callers expect the raw exception.
    me = await fetch_model_entity(transformed_spec.model, workspace, sdk)

    # For distillation jobs, also fetch the teacher model entity
    teacher_me: ModelEntity | None = None
    if isinstance(transformed_spec.training, DistillationTraining):
        try:
            teacher_me = await fetch_model_entity(transformed_spec.training.teacher_model, workspace, sdk)
        except ValueError as e:
            raise PlatformJobCompilationError(
                f"Teacher model '{transformed_spec.training.teacher_model}' not found. "
                "Verify the teacher model entity exists."
            ) from e
        except PermissionError as e:
            raise PlatformJobCompilationError(
                f"Access denied to teacher model '{transformed_spec.training.teacher_model}'."
            ) from e

    if transformed_spec.deployment_config is not None:
        auth_client = auth_client_context.get()
        if auth_client is None:
            raise PlatformJobCompilationError(
                "No auth context available; cannot validate deployment config permissions.",
            )
        await _validate_deployment_config(workspace, transformed_spec, sdk, auth_client)

    file_io_download_config = _build_file_download_config(transformed_spec, me, teacher_me)
    is_embedding_model_flag = _resolve_is_embedding_model(me)

    # The embedding NIM requires ONNX format, which cannot represent standalone LoRA adapters.
    # LoRA with merge=True (lora_merged) is allowed because it produces a full-weight model after training.
    if is_embedding_model_flag and transformed_spec.training.finetuning_type == FinetuningType.LORA:
        raise PlatformJobCompilationError(
            "NeMo Platform does not support unmerged LoRA for embedding models because the embedding NIM requires ONNX format, "
            "which cannot represent standalone adapters. "
            "Use peft with merge=True (lora_merged) or omit peft for all_weights training."
        )

    # Extract chat_template and tool_call_config from the source model entity's spec
    # (populated from fileset metadata by the model-spec-runner background task).
    # These are propagated to:
    #   1. The training step config (chat_template takes highest priority in template resolution)
    #   2. The output fileset metadata (so the model-spec-runner sets them on the output model)
    fileset_metadata = _build_output_fileset_metadata(me)
    file_io_upload_config = _build_file_upload_config(transformed_spec.output.fileset, fileset_metadata)

    # Build model_entity config for creating the model entity
    trust_remote_code = me.trust_remote_code or False
    model_entity_config = _build_model_entity_config(workspace, transformed_spec, trust_remote_code)

    steps = [
        # Step 1: Download model and dataset files from Files service
        _build_cpu_task_step(
            "model-and-dataset-download",
            "nmp.automodel.tasks.file_io",
            file_io_download_config.model_dump(mode="json"),
            cpu_resources,
            base_env,
        ),
        # Step 2: Training job
        compile_training_step(
            transformed_spec,
            base_env,
            me,
            teacher_me=teacher_me,
        ),
        # Step 3: Upload customized model
        _build_cpu_task_step(
            "model-upload",
            "nmp.automodel.tasks.file_io",
            file_io_upload_config.model_dump(mode="json"),
            cpu_resources,
            base_env,
        ),
        # Step 4: Create model entity
        _build_cpu_task_step(
            "model-entity-creation",
            "nmp.automodel.tasks.model_entity",
            model_entity_config.model_dump(mode="json"),
            cpu_resources,
            base_env,
        ),
    ]

    return PlatformJobSpec(steps=steps)
+# Generated tasks in k8s don't start with a lowercase letter per NAME_PATTERN, so we normalize +# by adding the prefix when missing. +# In Docker environment core/jobs/src/nmp/core/jobs/controllers/backends/docker.py, +# tasks are prefixed with `task-` by default: task_id = f"task-{uuid.uuid4().hex}" +def _normalize_task_name(task: str) -> str: + """Ensure task name uses the expected Jobs prefix.""" + if task.startswith("task-"): + return task + return f"task-{task}" + + +@dataclass(frozen=True) +class NMPJobContext: + """NeMo Platform Job context populated from Job Controller environment variables""" + + workspace: str + job_id: str + attempt_id: str + step: str + task: str + + # Service URLs + jobs_url: str | None + files_url: str | None + + # Storage paths + storage_path: Path + config_path: Path + + @property + def normalized_task(self) -> str: + """Task normalized for Jobs API compatibility.""" + return _normalize_task_name(self.task) + + @classmethod + def from_env(cls) -> Self: + """Create a NMPJobContext from environment variables""" + return cls( + workspace=os.environ.get(NEMO_JOB_WORKSPACE_ENVVAR, DEFAULT_WORKSPACE), + job_id=os.environ.get(NEMO_JOB_ID_ENVVAR, DEFAULT_JOB_ID), + attempt_id=os.environ.get(NEMO_JOB_ATTEMPT_ID_ENVVAR, DEFAULT_ATTEMPT_ID), + step=os.environ.get(NEMO_JOB_STEP_ENVVAR, DEFAULT_STEP), + task=os.environ.get(NEMO_JOB_TASK_ENVVAR, DEFAULT_TASK), + jobs_url=os.environ.get(NMP_JOBS_URL_ENVVAR), + files_url=os.environ.get(NMP_FILES_URL_ENVVAR), + storage_path=Path(os.environ.get(PERSISTENT_JOB_STORAGE_PATH_ENVVAR, DEFAULT_JOB_STORAGE_PATH)), + config_path=Path( + os.environ.get(NEMO_JOB_STEP_CONFIG_FILE_PATH_ENVVAR, DEFAULT_NEMO_JOB_STEP_CONFIG_FILE_PATH) + ), + ) diff --git a/services/automodel/src/nmp/automodel/app/jobs/file_io/schemas.py b/services/automodel/src/nmp/automodel/app/jobs/file_io/schemas.py new file mode 100644 index 00000000..c6a214fc --- /dev/null +++ 
class FileSetRef(BaseModel):
    """Reference to a FileSet, optionally qualified by workspace.

    Can be built from keyword fields or validated directly from a string in one
    of the forms ``name``, ``workspace/name`` or the legacy
    ``fileset://workspace/name``.
    """

    # workspace is optional because at compile time, the workspace is not known.
    # None tells the file_io task to use the job's workspace from the NMPJobContext.
    workspace: Optional[str] = None
    name: str

    def __str__(self) -> str:
        if self.workspace is None:
            return f"{self.name}"
        return f"{self.workspace}/{self.name}"

    def __repr__(self) -> str:
        return f"FileSetRef(workspace={self.workspace}, name={self.name})"

    @classmethod
    def _parse_string_parts(cls, ref: str) -> tuple[Optional[str], str] | None:
        """Parse a FileSet reference string into ``(workspace, name)``.

        Args:
            ref: Reference string, with or without the legacy ``fileset://`` prefix.

        Returns:
            ``(None, name)`` for a bare name, ``(workspace, name)`` for a
            qualified one (split at the first ``/``), or None when the reference
            is empty or has an empty workspace or name component.
        """
        # Strip the legacy prefix first so that "fileset://" alone is seen as empty.
        if ref.startswith(FILESET_PROTOCOL):
            ref = ref[len(FILESET_PROTOCOL) :]
        if not ref:
            return None
        workspace, separator, name = ref.partition("/")
        if not separator:
            # No slash at all: the whole string is the name.
            return None, workspace
        if not workspace or not name:
            # "/name" or "workspace/" would produce an unusable empty component.
            return None
        return workspace, name

    @classmethod
    def extract_name(cls, ref: str) -> str:
        """Extract the fileset/entity name from a reference string.

        Supports:
        - workspace/name
        - name
        - fileset://workspace/name (legacy, stripped)

        Raises:
            ValueError: If ``ref`` is not a valid FileSet reference (raised by
                model validation).
        """
        return cls.model_validate(ref).name

    @model_validator(mode="before")
    @classmethod
    def _convert_string_input(cls, v: object) -> object:
        """Convert a FileSet reference string into a dict of workspace and name.

        This makes it possible to create a FileSetRef from a string directly.
        Non-string input is returned untouched for normal field validation.

        Raises:
            ValueError: If ``v`` is a string that cannot be parsed.
        """
        if isinstance(v, str):
            result = cls._parse_string_parts(v)
            if result is None:
                raise ValueError(f"Invalid FileSet reference: {v}. Expected format: workspace/name")
            workspace, name = result
            return {"workspace": workspace, "name": name}
        return v
+ + Used when running: python -m nmp.automodel.tasks.file_io + """ + + download: list[DownloadItem] = Field(default_factory=list, description="List of FileSets to download.") + upload: list[UploadItem] = Field(default_factory=list, description="List of files to upload to FileSets.") + + +class TaskCompilationError(Exception): + """Error compiling a task configuration.""" + + pass + + +class FileDownloadError(Exception): + """Error downloading files from Files service.""" + + pass + + +class FileUploadError(Exception): + """Error uploading files to Files service.""" + + pass + + +class ProgressReportError(Exception): + """Error reporting progress to the Jobs service.""" + + pass + + +class PathTraversalError(ValueError): + """Error when a path attempts to escape the allowed base directory. + + This is a security error raised when user-provided paths like '../..' would + result in file operations outside the designated storage directory. + """ + + pass + + +@dataclass +class FileStats: + """Statistics for a file operation.""" + + total_bytes: int = 0 + failed_files: int = 0 + + +@dataclass +class DownloadStats(FileStats): + """Statistics for a download operation.""" + + files_downloaded: int = 0 + + +@dataclass +class UploadStats(FileStats): + """Statistics for a upload operation.""" + + files_uploaded: int = 0 diff --git a/services/automodel/src/nmp/automodel/app/jobs/model_entity/__init__.py b/services/automodel/src/nmp/automodel/app/jobs/model_entity/__init__.py new file mode 100644 index 00000000..c5ddfda4 --- /dev/null +++ b/services/automodel/src/nmp/automodel/app/jobs/model_entity/__init__.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0
+
+"""Model entity job configuration."""
+
+from .schemas import ModelEntityCreationError, ModelEntityTaskConfig
+
+__all__ = [
+    "ModelEntityCreationError",
+    "ModelEntityTaskConfig",
+]
diff --git a/services/automodel/src/nmp/automodel/app/jobs/model_entity/schemas.py b/services/automodel/src/nmp/automodel/app/jobs/model_entity/schemas.py
new file mode 100644
index 00000000..b2cd122d
--- /dev/null
+++ b/services/automodel/src/nmp/automodel/app/jobs/model_entity/schemas.py
@@ -0,0 +1,106 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Schemas for the model_entity task configuration."""
+
+from typing import Optional
+
+from nmp.automodel.app.jobs.file_io.schemas import FileSetRef
+from nmp.automodel.entities.values import FinetuningType
+from pydantic import BaseModel, Field
+
+
+class ToolCallConfig(BaseModel):
+    """Tool calling configuration for NIM deployments."""
+
+    tool_call_parser: Optional[str] = Field(default=None, description="Name of the tool call parser to use.")
+    tool_call_plugin: Optional[str] = Field(
+        default=None,
+        pattern=r"^[\w\-.]+/[\w\-.]+$",  # exactly one "/" between two non-empty runs of word chars, "-" or "."
+        description="Reference to a fileset containing the custom tool call plugin Python file. "
+        "Expected format: '{workspace}/{fileset_name}'.",
+    )
+    auto_tool_choice: Optional[bool] = Field(default=None, description="Whether to enable automatic tool choice.")
+
+
+class DeploymentParameters(BaseModel):
+    """Inline deployment parameters for creating a new ModelDeploymentConfig."""
+
+    gpu: int = Field(default=1, description="Number of GPUs required for deployment")
+    additional_envs: Optional[dict[str, str]] = Field(
+        default=None,
+        description="Additional environment variables for deployment",
+    )
+    disk_size: Optional[str] = Field(default=None, description="Disk size for deployment")
+    image_name: Optional[str] = Field(
+        default=None,
+        description="Container image name from NGC. Defaults to multi-llm when unset",
+    )
+    image_tag: Optional[str] = Field(default=None, description="Container image tag from NGC")
+    lora_enabled: bool = Field(
+        default=True,  # enabled unless the caller opts out
+        description=(
+            "When auto-deploying full SFT training, setting this true allows "
+            "subsequent LoRA adapters to be deployed against the model."
+        ),
+    )
+    tool_call_config: Optional[ToolCallConfig] = Field(
+        default=None,
+        description="Tool calling configuration override for the NIM deployment.",
+    )
+
+
+class PEFTConfig(BaseModel):
+    """PEFT configuration for LoRA and LoRA-merged fine-tuning."""
+
+    type: FinetuningType
+    rank: int  # required: no default is declared here
+    alpha: int  # required: no default is declared here
+
+
+class ModelEntityTaskConfig(BaseModel):
+    """Configuration for the model_entity task.
+
+    Used when running: python -m nmp.automodel.tasks.model_entity
+    """
+
+    name: str = Field(
+        description="Name of the model entity to create",
+    )
+    workspace: str = Field(
+        description="Workspace of the model entity to create",
+    )
+    description: Optional[str] = Field(
+        default=None,
+        description="Optional description of the model",
+    )
+    fileset: FileSetRef = Field(  # FileSetRef also validates from a plain "workspace/name" or "name" string
+        description="FileSet reference containing the customized model artifacts",
+    )
+    model_entity: str = Field(..., description="The model entity this model was based on.")  # "..." marks it required
+    base_model: Optional[str] = Field(
+        default=None,
+        description="Link to the base model used for customization",
+    )
+    peft: Optional[PEFTConfig] = Field(
+        default=None,
+        description="PEFT configuration. Set for LoRA/LoRA-merged, None for full SFT.",
+    )
+
+    trust_remote_code: bool = Field(
+        default=False,
+        description="Whether to trust remote code for the checkpoint, propagated from the source model entity.",
+    )
+
+    deployment_config: Optional[str | DeploymentParameters] = Field(  # str = existing config name; object = inline
+        default=None,
+        description="Deployment configuration. A string references an existing ModelDeploymentConfig "
+        "by name. An object provides inline NIM deployment parameters. "
+        "Omit to skip deployment.",
+    )
+
+
+class ModelEntityCreationError(Exception):
+    """Error creating model entity."""
+
+    pass
diff --git a/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py b/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py
new file mode 100644
index 00000000..1af931f9
--- /dev/null
+++ b/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py
@@ -0,0 +1,399 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Training step compiler."""
+
+import logging
+
+from nemo_platform.types.models.model_entity import ModelEntity
+from nmp.automodel.api.v2.jobs.schemas import (
+    AnyTraining,
+    CustomizationJobOutput,
+    DistillationTraining,
+    LoRAParams,
+    MLflowParams,
+    WandBParams,
+)
+from nmp.automodel.app.constants import (
+    DEFAULT_DATASET_PATH,
+    DEFAULT_MODEL_PATH,
+    DEFAULT_TEACHER_MODEL_PATH,
+    V4_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
+)
+from nmp.automodel.app.jobs.training.schemas import (
+    DistillationConfig,
+    LoRAConfig,
+    MLflowConfig,
+    ModelConfig,
+    TrainingStepConfig,
+    WandBConfig,
+)
+from nmp.automodel.config import config
+from nmp.automodel.entities.values import Precision, TrainingType
+from nmp.automodel.images import AUTOMODEL_PYTHON_ENTRYPOINT, get_training_image
+from nmp.common.jobs.api_factory import (
+    ContainerSpec,
+    DistributedGPUExecutionProviderSpec,
+    EnvironmentVariable,
+    EnvironmentVariableFromSecret,
+    GPUExecutionProviderSpec,
+    PlatformJobStep,
+    ResourcesSpec,
+    StepLifecycle,
+)
+from nmp.common.model_utils import is_embedding_model
+
+logger = logging.getLogger(__name__)
+
+
+def _resolve_is_embedding_model(me: ModelEntity) -> bool:
+    """Return the embedding flag from the spec when explicitly set, else fall back to the name-based check."""
+    if me.spec is None:
+        return is_embedding_model(me.name)
+
+    # Do not rely on `me.spec is not None` alone:
+    # older persisted ModelSpec payloads may not include `is_embedding_model`.
+    # Pydantic fills missing fields with the default (False), which would
+    # incorrectly classify legacy embedding models as LLMs.
+    model_fields_set = getattr(me.spec, "model_fields_set", getattr(me.spec, "__fields_set__", set()))
+    if "is_embedding_model" not in model_fields_set:
+        return is_embedding_model(me.name)
+
+    return me.spec.is_embedding_model or False  # "or False" turns an explicit None into False
+
+
+def _resolve_v4_compatible(me: ModelEntity) -> bool:
+    """Return True when spec.checkpoint_model_name is listed in V4_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES."""
+    if me.spec is None:
+        return False
+    checkpoint_model_name = getattr(me.spec, "checkpoint_model_name", None)
+    is_v4_compatible = checkpoint_model_name in V4_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+    logger.info(f"Checkpoint model name {checkpoint_model_name} is v4 compatible: {is_v4_compatible}")
+    return is_v4_compatible
+
+
+def _resolve_custom_implementation_override(me: ModelEntity) -> bool:  # result feeds ModelConfig.override_custom_impl
+    if me.spec is None:
+        return False
+
+    checkpoint_model_name = getattr(me.spec, "checkpoint_model_name", None)
+    if checkpoint_model_name == "NemotronHForCausalLM" and getattr(me.spec, "moe_config", None) is None:
+        # No moe_config means a V2 model (V3 uses MoE); V2 otherwise gets recognized as V3 and fails.
+        return True
+
+    if (
+        checkpoint_model_name == "MistralForCausalLM"
+        and getattr(me.spec, "family", None) == "mistral"
+        and getattr(me.spec, "is_chat", False)
+    ):
+        # Mistral 7b v0.3 Instruct: the custom tokenizer implementation fails with the log kept below (a bare string):
+        """2026-03-02 18:35:51 | INFO | root | Using model config to instantiate tokenizer
+        2026-03-02 18:35:53 | INFO | nemo_automodel._transformers.auto_tokenizer | Using custom tokenizer MistralCommonBackend for model type 'mistral'
+        2026-03-02 18:35:53 | WARNING | nemo_automodel._transformers.tokenization.tokenization_mistral_common | Multiple tokenizer files found in directory: /var/run/scratch/job/model. Using tokenizer.model.v3.
+        Instantiation failed for `ColumnMappedTextInstructionDataset`
+        Accepted signature : (path_or_dataset_id: Union[str, List[str]], column_mapping: Dict[str, str], tokenizer, *, split: Optional[str] = 'train', name: Optional[str] = None, answer_only_loss_mask: bool = True, seq_length: Optional[int] = None, padding: Union[str, bool] = 'do_not_pad', truncation: Union[str, bool] = 'do_not_truncate', limit_dataset_samples: Optional[int] = None, use_hf_chat_template: bool = False) -> None
+        Positional args : ()
+        Keyword args : { 'answer_only_loss_mask': True,
+        'column_mapping': {'answer': 'completion', 'question': 'prompt'},
+        'padding': 'do_not_pad',
+        'path_or_dataset_id': '/run/scratch/job/training/dataset/train.jsonl',
+        'seq_length': 1024,
+        'split': 'train',
+        'tokenizer': '******',
+        'truncation': 'longest_first'}
+        Exception : piece id is out of range.
+        """
+        return True
+
+    return False
+
+
+def compile_training_step(
+    job_spec: CustomizationJobOutput,
+    base_env: list[EnvironmentVariable],
+    me: ModelEntity,
+    teacher_me: ModelEntity | None = None,
+) -> PlatformJobStep:
+    """Compile job input to a PlatformJobStep for training.
+
+    Args:
+        job_spec: The customization job output specification.
+        base_env: Base environment variables for the job step.
+        me: The model entity being trained.
+        teacher_me: Optional teacher model entity for knowledge distillation jobs.
+    Raises: ValueError when the training type is DPO, which this compiler rejects.
+    """
+    job_spec.validate_for_training()
+    if TrainingType(job_spec.training.type) == TrainingType.DPO:
+        raise ValueError("DPO training is not supported by nmp-automodel")
+    trust_remote_code = me.trust_remote_code or False
+    chat_template = me.spec.chat_template if me.spec else None
+    is_embedding_model = _resolve_is_embedding_model(me)  # local name shadows the imported is_embedding_model()
+    override_custom_impl = _resolve_custom_implementation_override(me)
+    v4_compatible = _resolve_v4_compatible(me)
+    training = job_spec.training
+    p = training.parallelism
+    num_gpus_per_node = p.num_gpus_per_node
+
+    training_config = TrainingStepConfig(
+        model=_translate_model_config(
+            job_spec,
+            DEFAULT_MODEL_PATH,
+            trust_remote_code=trust_remote_code,
+            is_embedding_model=is_embedding_model,
+            chat_template=chat_template,
+            override_custom_impl=override_custom_impl,
+            v4_compatible=v4_compatible,
+        ),
+        dataset=TrainingStepConfig.DatasetConfig(
+            path=DEFAULT_DATASET_PATH,
+        ),
+        training=_translate_training_config(training, me, teacher_me=teacher_me),
+        schedule=TrainingStepConfig.ScheduleConfig(
+            epochs=training.epochs,
+            max_steps=training.max_steps,
+            val_check_interval=training.val_check_interval,
+        ),
+        batch=TrainingStepConfig.BatchConfig(
+            global_batch_size=training.batch_size,
+            micro_batch_size=training.micro_batch_size,
+            sequence_packing=training.sequence_packing,
+        ),
+        optimizer=TrainingStepConfig.OptimizerConfig(
+            learning_rate=training.learning_rate,
+            min_learning_rate=training.min_learning_rate,
+            weight_decay=training.weight_decay,
+            beta1=training.adam_beta1,
+            beta2=training.adam_beta2,
+            warmup_steps=training.warmup_steps,
+        ),
+        parallelism=TrainingStepConfig.ParallelismConfig(
+            num_nodes=p.num_nodes,
+            num_gpus_per_node=num_gpus_per_node,
+            tensor_parallel_size=p.tensor_parallel_size,
+            pipeline_parallel_size=p.pipeline_parallel_size,
+            context_parallel_size=p.context_parallel_size,
+            expert_parallel_size=p.expert_parallel_size,
+            sequence_parallel=p.sequence_parallel,
+        ),
+        integrations=_translate_integrations(job_spec),
+        output_model=job_spec.output.name,
+    )
+
+    container = ContainerSpec(
+        image=_get_training_image(),
+        entrypoint=AUTOMODEL_PYTHON_ENTRYPOINT,
+        command=["-m", "nmp.automodel.tasks.training"],
+    )
+
+    profile = (
+        training.execution_profile
+        if training.execution_profile is not None
+        else config.default_training_execution_profile
+    )
+
+    if p.num_nodes > 1:
+        logger.debug(f"Using distributed GPU executor: num_nodes={p.num_nodes}, num_gpus_per_node={num_gpus_per_node}")
+        executor = DistributedGPUExecutionProviderSpec(
+            provider="gpu_distributed",
+            profile=profile,
+            container=container,
+            resources=ResourcesSpec(
+                num_gpus=num_gpus_per_node,
+                num_nodes=p.num_nodes,
+            ),
+        )
+    else:
+        logger.debug(f"Using single-node GPU executor: num_gpus={num_gpus_per_node}")
+        executor = GPUExecutionProviderSpec(
+            provider="gpu",
+            profile=profile,
+            container=container,
+            resources=ResourcesSpec(
+                num_gpus=num_gpus_per_node,
+            ),
+        )
+
+    secret_envs = _collect_integration_secret_envs(job_spec)
+
+    return PlatformJobStep(
+        name="customization-training-job",
+        executor=executor,
+        environment=[*base_env, *secret_envs, EnvironmentVariable(name="HF_DATASETS_OFFLINE", value="1")],
+        config=training_config.model_dump(mode="json"),
+        lifecycle=StepLifecycle(staleness_timeout_seconds=config.training_staleness_timeout_seconds),
+    )
+
+
+def _translate_model_config(
+    job_spec: CustomizationJobOutput,
+    path: str,
+    trust_remote_code: bool = False,
+    is_embedding_model: bool = False,
+    chat_template: str | None = None,
+    override_custom_impl: bool = False,
+    v4_compatible: bool = False,
+) -> ModelConfig:
+    """Build the internal ModelConfig from the job spec's training settings plus the resolved model flags."""
+    training = job_spec.training
+    return ModelConfig(
+        path=path,
+        name=_extract_model_name(job_spec),
+        max_seq_length=training.max_seq_length,
+        precision=training.precision,
+        trust_remote_code=trust_remote_code,
+        is_embedding_model=is_embedding_model,
+        chat_template=chat_template,
+        override_custom_impl=override_custom_impl,
+        v4_compatible=v4_compatible,
+    )
+
+
+def _translate_training_config(
+    training: AnyTraining,
+    me: ModelEntity,
+    teacher_me: ModelEntity | None = None,
+) -> TrainingStepConfig.TrainingConfig:
+    """Translate API training method to internal TrainingConfig.
+
+    Args:
+        training: The API training configuration.
+        me: The primary model entity.
+        teacher_me: Teacher model entity, populated for distillation jobs.
+    """
+    training_type = TrainingType(training.type)
+    lora = _translate_lora_config(training.peft, me) if isinstance(training.peft, LoRAParams) else None
+
+    kd = None
+    if isinstance(training, DistillationTraining):
+        teacher_trust_remote_code = (teacher_me.trust_remote_code or False) if teacher_me else False
+        kd = DistillationConfig(
+            teacher_model=ModelConfig(
+                path=DEFAULT_TEACHER_MODEL_PATH,
+                name=training.teacher_model,
+                precision=Precision(training.teacher_precision),
+                trust_remote_code=teacher_trust_remote_code,
+            ),
+            ratio=training.distillation_ratio,
+            temperature=training.distillation_temperature,
+        )
+
+    return TrainingStepConfig.TrainingConfig(
+        training_type=training_type,
+        finetuning_type=training.finetuning_type,
+        lora=lora,
+        kd=kd,
+    )
+
+
+def _translate_lora_config(api_lora: LoRAParams, me: ModelEntity) -> LoRAConfig:
+    """Translate API LoRAParams to internal LoRAConfig, filling in target_modules when none are given."""
+    lora = LoRAConfig(
+        rank=api_lora.rank,
+        alpha=api_lora.alpha,
+        dropout=api_lora.dropout,
+        target_modules=api_lora.target_modules,
+        use_triton=True,
+    )
+
+    if not lora.target_modules:
+        if me.spec and me.spec.checkpoint_model_name == "NemotronHForCausalLM":
+            # Need to remove out_proj from the list of target modules
+            modules = set()
+            if me.spec.linear_layers:
+                for ll in me.spec.linear_layers:
+                    m = ll.name.split(".")[-1]  # last dotted component of the layer name
+                    if m.endswith("proj"):
+                        modules.add(f"*.{m}")
+                modules.discard("*.out_proj")
+
+            # In cases when model_spec has linear_layers as null, we need to set the target_modules to default
+            # If target_modules is empty we get this error during training:
+            # Expected match_all_linear to be true or target_modules/exclude_modules to be non-empty
+            lora.target_modules = list(modules) if modules else ["*proj"]
+        else:
+            lora.target_modules = ["*proj"]
+    return lora
+
+
+def _translate_wandb_config(api_wandb: WandBParams | None) -> WandBConfig | None:
+    """Translate API WandBParams to internal WandBConfig."""
+    if api_wandb is None:
+        return None
+
+    return WandBConfig(
+        project=api_wandb.project,
+        name=api_wandb.name,
+        entity=api_wandb.entity,
+        tags=api_wandb.tags,
+        notes=api_wandb.notes,
+        base_url=api_wandb.base_url,
+    )
+
+
+def _translate_mlflow_config(api_mlflow: MLflowParams | None) -> MLflowConfig | None:
+    """Translate API MLflowParams to internal MLflowConfig."""
+    if api_mlflow is None:
+        return None
+
+    return MLflowConfig(
+        experiment_name=api_mlflow.experiment_name,
+        run_name=api_mlflow.run_name,
+        tags=api_mlflow.tags,
+        description=api_mlflow.description,
+        tracking_uri=api_mlflow.tracking_uri,
+    )
+
+
+def _translate_integrations(job_spec: CustomizationJobOutput) -> TrainingStepConfig.IntegrationsConfig:
+    """Translate API IntegrationsConfig to internal IntegrationsConfig."""
+    if not job_spec.integrations:
+        return TrainingStepConfig.IntegrationsConfig()
+
+    return TrainingStepConfig.IntegrationsConfig(
+        wandb=_translate_wandb_config(job_spec.integrations.wandb),
+        mlflow=_translate_mlflow_config(job_spec.integrations.mlflow),
+    )
+
+
+def _collect_integration_secret_envs(job_input: CustomizationJobOutput) -> list[EnvironmentVariable]:
+    """Collect secret environment variables from integration configs.
+
+    Secrets are propagated via PlatformJobStep.environment (not config) so that
+    the Jobs service can resolve secret references at runtime.
+    """
+    secret_envs: list[EnvironmentVariable] = []
+    if not job_input.integrations:
+        return secret_envs
+
+    if job_input.integrations.wandb and job_input.integrations.wandb.api_key_secret:
+        secret_envs.append(
+            EnvironmentVariable(
+                name="WANDB_API_KEY",  # the only secret-backed variable emitted by this function
+                from_secret=EnvironmentVariableFromSecret(
+                    name=job_input.integrations.wandb.api_key_secret.root,
+                ),
+            )
+        )
+
+    return secret_envs
+
+
+def _extract_model_name(job_spec: CustomizationJobOutput) -> str | None:
+    """Extract the canonical model name from the model field for template lookup.
+
+    Returns job_spec.model unchanged when it contains "/" (the "workspace/name" pattern,
+    e.g., "meta/llama-3.1-8b-instruct"), and None otherwise.
+    """
+    model = job_spec.model
+
+    if "/" in model:
+        logger.debug(f"Extracted model name from URN: {model}")
+        return model
+
+    return None
+
+
+def _get_training_image() -> str:
+    """Return config.training_automodel_image when set, otherwise the default from get_training_image()."""
+    return config.training_automodel_image or get_training_image()
diff --git a/services/automodel/src/nmp/automodel/app/jobs/training/schemas.py b/services/automodel/src/nmp/automodel/app/jobs/training/schemas.py
new file mode 100644
index 00000000..93d8ae7f
--- /dev/null
+++ b/services/automodel/src/nmp/automodel/app/jobs/training/schemas.py
@@ -0,0 +1,293 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from enum import Enum
+from typing import Optional
+
+from nmp.automodel.app.constants import (
+    DEFAULT_OUTPUT_MODEL_PATH,
+    DEFAULT_SEED,
+    DEFAULT_TRAINING_OUTPUT_PATH,
+)
+from nmp.automodel.entities.values import CheckpointFormat, FinetuningType, Precision, TrainingType
+from pydantic import BaseModel, Field
+
+
+class OptimizerType(str, Enum):
+    """Optimizer and scheduler combination types."""
+
+    ADAMW_WITH_COSINE_ANNEALING = "adamw_with_cosine_annealing"
+    ADAM_WITH_COSINE_ANNEALING = "adam_with_cosine_annealing"
+    ADAMW_WITH_FLAT_LR = "adamw_with_flat_lr"
+    ADAM_WITH_FLAT_LR = "adam_with_flat_lr"
+
+
+class LoRAConfig(BaseModel):  # NOTE(review): the docstring mentions match_all_linear, but no such field is declared
+    """Internal LoRA configuration with implementation details.
+
+    This differs from the API LoRAParams:
+    - Includes use_triton, match_all_linear (implementation details)
+    - exclude_modules for advanced control
+    - Can add new fields freely without breaking API
+    """
+
+    # Core LoRA parameters (from API)
+    rank: int = Field(default=8, description="LoRA rank (low-rank dimension)")
+    alpha: int = Field(default=32, description="LoRA alpha scaling factor")
+    dropout: float = Field(default=0.0, description="LoRA dropout probability")
+
+    # Module targeting
+    target_modules: Optional[list[str]] = Field(
+        default=None, description="Module name patterns to apply LoRA to (e.g., ['*.proj'])"
+    )
+    exclude_modules: Optional[list[str]] = Field(default=None, description="Module name patterns to exclude from LoRA")
+
+    # Implementation details (not in API)
+    use_triton: bool = Field(default=True, description="Use optimized Triton LoRA kernel")
+
+
+class ModelConfig(BaseModel):
+    """Internal model configuration."""
+
+    path: str = Field(description="Path to a model directory (contains config, weights, tokenizer etc.)")
+    name: Optional[str] = Field(
+        default=None,
+        description="Model identifier (e.g., 'meta/llama-3.1-8b-instruct')",
+    )
+    max_seq_length: int = Field(
+        default=2048,
+        description="Maximum token sequence length for training; longer sequences are truncated",
+    )
+
+    # Model loading options
+    precision: Optional[Precision] = Field(
+        default=None,
+        description="Model weight dtype (e.g., 'bf16', 'fp16'). None implies auto-detects from model config",
+    )
+    attn_implementation: Optional[str] = Field(
+        default="sdpa",
+        description="Attention backend: 'sdpa' (PyTorch native), 'flash_attention_2' (requires flash-attn), 'eager' (no optimization)",
+    )
+    trust_remote_code: bool = Field(
+        default=False,
+        description="Allow executing custom model code from the checkpoint. Required for some community models",
+    )
+    is_embedding_model: bool = Field(
+        default=False,
+        description="Whether the model is an embedding model",
+    )
+    chat_template: Optional[str] = Field(
+        default=None,
+        description="Jinja2 chat template from the model entity spec or fileset metadata. "
+        "Takes highest priority in resolve_chat_template when set.",
+    )
+
+    override_custom_impl: bool = Field(
+        default=False,
+        description="Some of the custom implementations in nemo automodel cause loading failures when used with other models in the same family, this forces the use_hf=True flag to use non custom implementations.",
+    )
+
+    v4_compatible: bool = Field(
+        default=False,
+        description="Enable transformers-v4-compatible checkpoint output that preserves the original transformers-v4-style config.json output.",
+    )
+
+
+class DistillationConfig(BaseModel):  # the "teacher" in the docstring below is the teacher_model field
+    """Internal Knowledge Distillation configuration.
+
+    teacher is a ModelConfig with resolved path, not a URN.
+    """
+
+    # Teacher model (resolved path)
+    teacher_model: ModelConfig = Field(description="Teacher model configuration with resolved path")
+
+    # KD hyperparameters
+    ratio: float = Field(default=0.5, description="Balance between CE loss and KD loss")
+    temperature: float = Field(default=1.0, description="Softmax temperature for KD")
+
+    # Implementation detail (not in API)
+    offload_teacher: bool = Field(default=False, description="Offload teacher model to CPU for memory efficiency")
+
+
+class EmbeddingConfig(BaseModel):  # NOTE(review): not referenced by TrainingStepConfig or any other class in this module
+    """Internal Embedding/Biencoder model finetuning configuration.
+
+    This is used internally when a model is detected as an embedding model
+    by its name. The defaults here match the recommended settings for
+    NeMo embedding models.
+
+    Note: Embedding models are detected by model name (e.g., contains 'embed'),
+    not by a separate training type. They use standard SFT training type.
+
+    Model architecture parameters (share_encoder, pooling, l2_normalize, temperature,
+    add_linear_pooler, out_dimension) use sensible defaults and are not exposed here.
+    """
+
+    # Training configuration
+    train_n_passages: int = Field(
+        default=5,
+        description=(
+            "Total number of passages per query during training: 1 positive + (n-1) negatives. "
+            "For example, train_n_passages=5 means 1 positive and 4 negative passages per query."
+        ),
+    )
+    eval_negative_size: Optional[int] = Field(
+        default=None,
+        description=(
+            "Number of negative passages per query during validation. "
+            "Recommended to keep as train_n_passages - 1 for consistent train/eval behavior. "
+            "If not set, defaults to train_n_passages - 1."
+        ),
+    )
+
+    # Memory optimization
+    do_gradient_checkpointing: bool = Field(
+        default=False,
+        description=(
+            "Enable gradient checkpointing to reduce memory usage at the cost of slower training. "
+            "Useful for larger embedding models or memory-constrained environments."
+        ),
+    )
+
+    # Tokenization configuration
+    query_max_length: int = Field(default=512, description="Maximum token length for query tokenization")
+    passage_max_length: int = Field(default=512, description="Maximum token length for passage tokenization")
+    query_prefix: str = Field(default="query:", description="Prefix to prepend to queries before tokenization")
+    passage_prefix: str = Field(default="passage:", description="Prefix to prepend to passages before tokenization")
+
+
+class WandBConfig(BaseModel):
+    """Internal Weights & Biases configuration."""
+
+    project: Optional[str] = Field(default=None, description="W&B project name")
+    name: Optional[str] = Field(default=None, description="W&B run name")
+    entity: Optional[str] = Field(default=None, description="W&B entity")
+    tags: Optional[list[str]] = Field(default=None, description="W&B tags")
+    notes: Optional[str] = Field(default=None, description="W&B notes")
+    base_url: Optional[str] = Field(default=None, description="Self-hosted W&B server URL")
+
+
+class MLflowConfig(BaseModel):
+    """Internal MLflow configuration."""
+
+    experiment_name: Optional[str] = Field(default=None, description="MLflow experiment name")
+    run_name: Optional[str] = Field(default=None, description="MLflow run name")
+    tags: Optional[dict[str, str]] = Field(default=None, description="MLflow tags")
+    description: Optional[str] = Field(default=None, description="MLflow description")
+    tracking_uri: Optional[str] = Field(default=None, description="MLflow tracking URI")
+
+
+class TrainingStepConfig(BaseModel):
+    """Normalized training configuration compiled into nemo-automodel recipe YAML."""
+
+    class DatasetConfig(BaseModel):  # dataset location plus optional prompt/BOS/EOS settings
+        path: str
+        prompt_template: Optional[str] = None
+        add_bos: Optional[bool] = None
+        add_eos: Optional[bool] = None
+
+    class TrainingConfig(BaseModel):  # training method; lora and kd stay None unless configured
+        training_type: TrainingType
+        finetuning_type: Optional[FinetuningType] = None
+        lora: Optional[LoRAConfig] = None
+        kd: Optional[DistillationConfig] = None
+
+    class ScheduleConfig(BaseModel):  # epoch/step limits and validation cadence
+        epochs: int = 1
+        max_steps: Optional[int] = None
+        val_check_interval: Optional[float] = None
+
+    class BatchConfig(BaseModel):  # both batch sizes are validated as > 0
+        global_batch_size: int = Field(default=32, gt=0)
+        micro_batch_size: int = Field(default=1, gt=0)
+        sequence_packing: bool = False
+        sequence_packing_max_samples: int = 1000
+
+    class OptimizerConfig(BaseModel):  # optimizer hyperparameters
+        optimizer_type: Optional[OptimizerType] = Field(default=None)
+        learning_rate: float = 1e-4
+        min_learning_rate: Optional[float] = None
+        eps: float = 1e-8
+        weight_decay: float = 0.01
+        beta1: float = 0.9
+        beta2: float = 0.999
+        warmup_steps: int = 0
+
+    class ParallelismConfig(BaseModel):  # node/GPU counts and parallel sizes; defaults describe one GPU on one node
+        num_nodes: int = 1
+        num_gpus_per_node: int = 1
+        tensor_parallel_size: int = 1
+        pipeline_parallel_size: int = 1
+        context_parallel_size: int = 1
+        expert_parallel_size: Optional[int] = None
+        sequence_parallel: bool = False
+
+    class IntegrationsConfig(BaseModel):  # optional experiment-tracking integrations
+        wandb: Optional[WandBConfig] = None
+        mlflow: Optional[MLflowConfig] = None
+
+    # === Main Config Fields ===
+    model: ModelConfig
+    dataset: DatasetConfig
+    training: TrainingConfig
+    schedule: ScheduleConfig
+    batch: BatchConfig
+    optimizer: OptimizerConfig
+    parallelism: ParallelismConfig
+    integrations: IntegrationsConfig = Field(default_factory=IntegrationsConfig)
+
+    # === Output Paths ===
+    output_model: str  # Set at compile-time from CustomizationJobOutput
+    workspace_path: str = Field(default=DEFAULT_TRAINING_OUTPUT_PATH)
+    output_path: str = Field(default=DEFAULT_OUTPUT_MODEL_PATH)
+
+    # === Miscellaneous ===
+    seed: int = Field(
+        default=DEFAULT_SEED, description="Random seed for ensuring reproducibility in all random processes."
+    )
+    training_timeout: Optional[int] = None  # NOTE(review): unit is not stated in this file - confirm with the consumer
+
+
+class GPUInfo(BaseModel):
+    """GPU architecture information captured during training."""
+
+    architecture: str
+    device_name: str
+    memory_gb: float
+    cuda_version: str
+
+
+class CheckpointInfo(BaseModel):
+    """Output checkpoint information."""
+
+    path: str
+    format: CheckpointFormat
+    precision: Optional[Precision] = Field(
+        default=None, description="Checkpoint precision. None when auto-detected from model config."
+    )
+
+
+class TrainingMetrics(BaseModel):
+    """Final training metrics."""
+
+    final_loss: Optional[float] = None
+    final_val_loss: Optional[float] = None
+    best_val_loss: Optional[float] = None
+    total_steps: int = 0
+    total_epochs: int = 0
+
+
+class TrainingResult(BaseModel):
+    """
+    Result written by training task.
+
+    Written to: {workspace_path}/training_result.json
+    """
+
+    success: bool
+    error_message: Optional[str] = None
+    checkpoint: Optional[CheckpointInfo] = None
+    gpu_info: Optional[GPUInfo] = None
+    metrics: TrainingMetrics = Field(default_factory=TrainingMetrics)
+    training_duration_seconds: Optional[float] = None
diff --git a/services/automodel/src/nmp/automodel/compile.py b/services/automodel/src/nmp/automodel/compile.py
new file mode 100644
index 00000000..2781fb9e
--- /dev/null
+++ b/services/automodel/src/nmp/automodel/compile.py
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 + +"""Public compile entrypoint for Automodel jobs.""" + +from __future__ import annotations + +from nmp.automodel.adapter import automodel_spec_to_compiler_output +from nmp.automodel.api.v2.jobs.schemas import CustomizationJobOutput +from nmp.automodel.app.jobs.compiler import platform_job_config_compiler as _compile_canonical + + +async def platform_job_config_compiler( + job_spec: CustomizationJobOutput | object, + workspace: str, + sdk: object, + job_name: str | None = None, + profile: str | None = None, +) -> object: + """Compile Automodel job spec (plugin or legacy shape) to PlatformJobSpec.""" + if not isinstance(job_spec, CustomizationJobOutput): + job_spec = automodel_spec_to_compiler_output(job_spec) + if profile and job_spec.training.execution_profile is None: + job_spec = job_spec.model_copy( + update={"training": job_spec.training.model_copy(update={"execution_profile": profile})}, + ) + return await _compile_canonical( + workspace, + job_spec, + sdk, # type: ignore[arg-type] + ) + + +__all__ = ["platform_job_config_compiler", "automodel_spec_to_compiler_output"] diff --git a/services/automodel/src/nmp/automodel/config.py b/services/automodel/src/nmp/automodel/config.py new file mode 100644 index 00000000..4a698993 --- /dev/null +++ b/services/automodel/src/nmp/automodel/config.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Configuration for the nmp-automodel compiler and tasks.""" + +from nmp.common.config import create_service_config_class, get_platform_config, get_service_config +from pydantic import Field + + +class AutomodelConfig(create_service_config_class("automodel")): # type: ignore + """Environment variables use the NMP_AUTOMODEL_ prefix.""" + + image_registry: str = Field( + default="nvcr.io/0921617854601259/nemo-platform-dev", + description=( + "Registry host/path prefix for nmp-automodel-tasks and nmp-automodel-training. " + "Override via NMP_AUTOMODEL_IMAGE_REGISTRY for other environments." + ), + ) + training_image: str | None = Field( + default=None, + description="Override GPU training image (default: nmp-automodel-training under image_registry).", + ) + tasks_image: str | None = Field( + default=None, + description="Override CPU tasks image (default: nmp-automodel-tasks under image_registry).", + ) + + default_job_resource_cpu_request: str = Field(default="1") + default_job_resource_memory_request: str = Field(default="8Gi") + default_job_resource_cpu_limit: str = Field(default="4") + default_job_resource_memory_limit: str = Field(default="16Gi") + + training_staleness_timeout_seconds: int = Field( + default=3600, + description="Terminate training if no task progress within this many seconds (0 disables).", + ) + + default_training_execution_profile: str = Field( + default="gpu", + description="Default GPU execution profile when the job spec omits training.execution_profile.", + ) + + +config = get_service_config(AutomodelConfig) +platform_config = get_platform_config() + +# Legacy compiler attribute names +config.training_automodel_image = config.training_image diff --git a/services/automodel/src/nmp/automodel/entities/__init__.py b/services/automodel/src/nmp/automodel/entities/__init__.py new file mode 100644 index 00000000..13e49b93 --- /dev/null +++ b/services/automodel/src/nmp/automodel/entities/__init__.py @@ -0,0 
+1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Customizer entity definitions. + +This module exports: +- Entity classes (database/persistence models) +- Shared value types (enums and read-only metadata) + +Configuration types (LoRAConfig, ModelConfig, etc.) are NOT exported here. +They belong in their respective layers: +- API types → api/v2/jobs/schemas.py +- Internal types → app/jobs/training/schemas.py +""" + +from .values import ( + CheckpointFormat, + FinetuningType, + Precision, + TrainingType, +) + +__all__ = [ + # Enums + "CheckpointFormat", + "FinetuningType", + "Precision", + "TrainingType", +] diff --git a/services/automodel/src/nmp/automodel/entities/validators.py b/services/automodel/src/nmp/automodel/entities/validators.py new file mode 100644 index 00000000..b4c97054 --- /dev/null +++ b/services/automodel/src/nmp/automodel/entities/validators.py @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Shared validation logic for entity fields.""" + +import re +from typing import Optional + +from nmp.automodel.app.jobs.file_io.schemas import FILESET_PROTOCOL, FileSetRef +from nmp.common.entities.constants import REGEX_WORD_CHARACTER_DOT_DASH + +_NAME_REGEX = re.compile(REGEX_WORD_CHARACTER_DOT_DASH) +_UNSUPPORTED_PROTOCOLS = ("hf://", "ngc://", "s3://", "gs://") + + +def _normalize_fileset_ref(uri: str) -> str: + """Parse and return canonical fileset reference (no ``fileset://`` prefix).""" + normalized = uri.strip() + for prefix in _UNSUPPORTED_PROTOCOLS: + if normalized.startswith(prefix): + raise ValueError( + f"Unsupported dataset URI protocol. Use 'workspace/name' or 'name' (resolved in the job workspace). 
Got: {uri}", + ) + if normalized.startswith(FILESET_PROTOCOL): + normalized = normalized[len(FILESET_PROTOCOL) :] + ref = FileSetRef.model_validate(normalized) + if not _NAME_REGEX.match(ref.name): + raise ValueError( + f"Invalid dataset name: '{ref.name}'. Entity names must contain only word characters, dots, and hyphens.", + ) + return str(ref) + + +def validate_fileset_uri(uri: str) -> str: + """Validate a fileset reference as ``workspace/name`` or ``name``. + + The job path ``workspace`` is used when the reference is a bare name. + A legacy ``fileset://`` prefix is accepted and stripped. + """ + return _normalize_fileset_ref(uri) + + +def validate_optional_fileset_uri(uri: Optional[str]) -> Optional[str]: + """Validate fileset reference, allowing None.""" + if uri is None: + return None + return validate_fileset_uri(uri) diff --git a/services/automodel/src/nmp/automodel/entities/values.py b/services/automodel/src/nmp/automodel/entities/values.py new file mode 100644 index 00000000..b236ac9b --- /dev/null +++ b/services/automodel/src/nmp/automodel/entities/values.py @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Value types for the Customizer service.""" + +from enum import Enum, StrEnum + + +class CheckpointFormat(str, Enum): + """Model checkpoint format (input or output).""" + + HF = "hf" # Standard HuggingFace format + HF_PEFT = "hf-peft" # HuggingFace PEFT adapter (LoRA, etc.) + NEMO = "nemo" # NeMo checkpoint format + + +class Precision(str, Enum): + """Model precision for training.""" + + FP8 = "fp8" + BF16 = "bf16" + FP16 = "fp16" + FP32 = "fp32" + + def to_torch_dtype(self) -> str: + """ + Convert to a torch dtype string compatible with HuggingFace/Automodel. + + Returns: + String like "bfloat16", "float16", "float32" that can be passed to + from_pretrained(torch_dtype=...) or Automodel's dtype_from_str(). 
+
+        Raises:
+            ValueError: If this precision cannot be represented as a torch dtype.
+                FP8 is the only such member: it requires a separate quantization config.
+        """
+        # Only precisions that correspond one-to-one to a torch dtype name are listed;
+        # anything absent from this table (currently FP8) is rejected below.
+        mapping = {
+            Precision.BF16: "bfloat16",
+            Precision.FP16: "float16",
+            Precision.FP32: "float32",
+        }
+        if self not in mapping:
+            raise ValueError(
+                f"Precision '{self.value}' cannot be converted to a torch dtype. "
+                f"Supported: {[p.value for p in mapping]}. "
+                f"Note: FP8 requires separate quantization config."
+            )
+        return mapping[self]
+
+    @classmethod
+    def from_hf_dtype(cls, hf_dtype: str) -> "Precision":
+        """
+        Create Precision from a HuggingFace torch_dtype string.
+
+        Args:
+            hf_dtype: String like "bfloat16", "float16", "float32", "float".
+
+        Returns:
+            Corresponding Precision enum value.
+
+        Raises:
+            ValueError: If the dtype string is not recognized.
+        """
+        # "float" is accepted as an alias of "float32".
+        mapping = {
+            "bfloat16": cls.BF16,
+            "float16": cls.FP16,
+            "float32": cls.FP32,
+            "float": cls.FP32,
+        }
+        if hf_dtype not in mapping:
+            raise ValueError(f"Unknown HuggingFace dtype '{hf_dtype}'. Supported: {list(mapping.keys())}")
+        return mapping[hf_dtype]
+
+
+class TrainingType(str, Enum):
+    """Training algorithm type."""
+
+    SFT = "sft"
+    DISTILLATION = "distillation"
+    DPO = "dpo"
+    GRPO = "grpo"
+
+
+class FinetuningType(str, Enum):
+    """Finetuning strategy (full weights vs PEFT)."""
+
+    ALL_WEIGHTS = "all_weights"
+    LORA = "lora"
+    LORA_MERGED = "lora_merged"
+
+
+class OutputNameType(StrEnum):
+    """Output artifact type."""
+
+    ADAPTER = "adapter"
+    MODEL = "model"
diff --git a/services/automodel/src/nmp/automodel/images.py b/services/automodel/src/nmp/automodel/images.py
new file mode 100644
index 00000000..9f782b28
--- /dev/null
+++ b/services/automodel/src/nmp/automodel/images.py
@@ -0,0 +1,47 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 + +"""Docker image resolution for nmp-automodel job steps.""" + +from __future__ import annotations + +from nmp.automodel.config import config +from nmp.common.jobs.image import get_qualified_image + +# Default NGC dev registry for platform-built automodel images (flat repo names for NVCR). +DEFAULT_AUTOMODEL_IMAGE_REGISTRY = "nvcr.io/0921617854601259/nemo-platform-dev" + +BASE_IMAGE_NAME = "nmp-automodel-base" +TASKS_IMAGE_NAME = "nmp-automodel-tasks" +TRAINING_IMAGE_NAME = "nmp-automodel-training" + +# Must match ENTRYPOINT in Dockerfile.nmp-automodel-{tasks,training}. +# Job specs must set this explicitly: Docker API create() replaces the image +# entrypoint when the platform passes entrypoint=[]. +AUTOMODEL_PYTHON_ENTRYPOINT = ["/opt/venv/bin/python"] + + +def get_automodel_qualified_image(name: str, override: str | None = None) -> str: + """Resolve a job step image reference. + + Args: + name: Image repository name under the registry (e.g. ``nmp-automodel-tasks``). + override: Full image ref from ``NMP_AUTOMODEL_TASKS_IMAGE`` / ``NMP_AUTOMODEL_TRAINING_IMAGE``. + + Returns: + Fully qualified image (``{registry}/{name}:{tag}``) unless ``override`` is set. 
+ """ + if override: + return override + registry = config.image_registry or DEFAULT_AUTOMODEL_IMAGE_REGISTRY + return get_qualified_image(name, registry=registry) + + +def get_tasks_image() -> str: + """CPU task steps (file_io, model_entity).""" + return get_automodel_qualified_image(TASKS_IMAGE_NAME, config.tasks_image) + + +def get_training_image() -> str: + """GPU training step.""" + return get_automodel_qualified_image(TRAINING_IMAGE_NAME, config.training_image) diff --git a/services/automodel/src/nmp/automodel/platform_client.py b/services/automodel/src/nmp/automodel/platform_client.py new file mode 100644 index 00000000..d55672d1 --- /dev/null +++ b/services/automodel/src/nmp/automodel/platform_client.py @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from nemo_platform import AsyncNeMoPlatform +from nemo_platform._exceptions import NotFoundError, PermissionDeniedError +from nemo_platform.types.models import ModelEntity +from nmp.automodel.app.jobs.file_io.schemas import FileSetRef +from nmp.common.entities.utils import parse_entity_ref + + +async def check_dataset_access(sdk: AsyncNeMoPlatform, dataset_uri: str, default_workspace: str) -> None: + """Verify the caller can access the dataset fileset.""" + ref = FileSetRef.model_validate(dataset_uri) + workspace = ref.workspace or default_workspace + try: + await sdk.files.filesets.retrieve(workspace=workspace, name=ref.name) + except PermissionDeniedError: + raise PermissionError(f"Access denied to dataset fileset '{workspace}/{ref.name}'") from None + except NotFoundError: + raise ValueError( + f"Dataset fileset '{ref.name}' not found in workspace '{workspace}'. Verify the dataset exists." 
+ ) from None + + +async def fetch_model_entity( + model_ref: str, + default_workspace: str, + sdk: AsyncNeMoPlatform, +) -> ModelEntity: + """Retrieve a model entity by reference string.""" + resolved_ref = parse_entity_ref(model_ref, default_workspace) + try: + return await sdk.models.retrieve(name=resolved_ref.name, workspace=resolved_ref.workspace, verbose=True) + except PermissionDeniedError: + raise PermissionError(f"Access denied to model '{resolved_ref.workspace}/{resolved_ref.name}'") from None + except NotFoundError: + raise ValueError( + f"Model entity not found: '{resolved_ref.workspace}/{resolved_ref.name}'. Verify the model entity exists." + ) from None diff --git a/services/automodel/src/nmp/automodel/tasks/__init__.py b/services/automodel/src/nmp/automodel/tasks/__init__.py new file mode 100644 index 00000000..63c71371 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/__init__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Automodel task entrypoints (see ``nmp.automodel.tasks.`` subpackages).""" diff --git a/services/automodel/src/nmp/automodel/tasks/__main__.py b/services/automodel/src/nmp/automodel/tasks/__main__.py new file mode 100644 index 00000000..6e6e8654 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/__main__.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Default entrypoint for the nmp-automodel-tasks image (help / task listing). + +Production job steps invoke a specific module directly, e.g. +``python -m nmp.automodel.tasks.file_io``. 
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+_TASK_MODULES = (
+    ("file_io", "nmp.automodel.tasks.file_io", "Download/upload model and dataset files"),
+    ("model_entity", "nmp.automodel.tasks.model_entity", "Create output model entity"),
+)
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Print the task-module listing for the CPU tasks image.
+
+    Args:
+        argv: Command-line arguments without the program name. ``None`` means
+            "use ``sys.argv[1:]``"; an explicit empty list means "no arguments".
+
+    Returns:
+        Always 0. argparse itself exits the process for ``--help`` or on
+        unrecognized arguments.
+    """
+    parser = argparse.ArgumentParser(
+        prog="python -m nmp.automodel.tasks",
+        description="NeMo Automodel CPU task image. The jobs compiler runs one module per step.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="Examples:\n"
+        " python -m nmp.automodel.tasks --help\n"
+        " python -m nmp.automodel.tasks.file_io\n"
+        " python -m nmp.automodel.tasks.model_entity\n\n"
+        "GPU training uses the nmp-automodel-training image:\n"
+        " python -m nmp.automodel.tasks.training\n",
+    )
+    parser.add_argument(
+        "--list",
+        action="store_true",
+        help="List task modules and exit (default when no job config is provided).",
+    )
+    # Resolve the argument list once. Testing ``argv is None`` (not truthiness)
+    # keeps an explicit ``[]`` from silently falling back to the process argv.
+    effective_argv = sys.argv[1:] if argv is None else argv
+    args = parser.parse_args(effective_argv)
+    if args.list or len(effective_argv) == 0:
+        print("Task modules:\n")
+        for name, module, summary in _TASK_MODULES:
+            print(f" {name:14} {module}")
+            print(f" {summary}\n")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/services/automodel/src/nmp/automodel/tasks/docker/README.md b/services/automodel/src/nmp/automodel/tasks/docker/README.md
new file mode 100644
index 00000000..b5797465
--- /dev/null
+++ b/services/automodel/src/nmp/automodel/tasks/docker/README.md
@@ -0,0 +1,74 @@
+# File I/O Task Docker Testing
+
+Scripts for running the file_io task container locally.
+
+## Prerequisites
+
+1. **Build the Docker image** from the repository root:
+This will build `my-registry/nmp-cpu-tasks:local` image that will be used for this task.
+
+   ```bash
+   cd /path/to/nmp
+   make docker/nmp-cpu-tasks
+   ```
+
+2.
**Have NeMo Platform running** (files service) at `http://localhost:8080`
+
+## Quick Start
+
+### Run with Docker Compose
+
+```bash
+cd services/automodel/src/nmp/automodel/tasks/docker
+
+# Run the task
+docker compose up
+
+# Run with custom image
+FILE_IO_IMAGE=my-registry/nmp-cpu-tasks:dev docker compose up
+
+# Run interactively
+docker compose run --rm file-io run task --task nmp.automodel.tasks.file_io
+```
+
+## Configuration
+
+### Environment Variables
+
+| Variable | Description | Default |
+|----------|-------------|---------|
+| `NMP_BASE_URL` | Base URL for NeMo Platform | `http://host.docker.internal:8000` |
+| `NMP_FILES_URL` | Files service URL | `http://host.docker.internal:8000` |
+| `NMP_JOBS_URL` | Jobs service URL (for progress) | `http://host.docker.internal:8000` |
+| `NEMO_JOB_ID` | Job identifier | `multi-file-job` |
+| `NEMO_JOB_STEP` | Step name | `FileIO` |
+| `NEMO_JOB_TASK` | Task identifier | `file-io-task` |
+| `NEMO_JOB_WORKSPACE` | Workspace name | `default` |
+| `LOG_LEVEL` | Logging level | `INFO` |
+| `FILE_IO_IMAGE` | Docker image to use | `nvcr.io/0921617854601259/nemo-platform-dev/nmp-automodel-tasks:local` |
+
+### Config File Format
+
+The `sample_config.json` defines what files to upload/download:
+
+```json
+{
+  "upload": [
+    {
+      "src": "local_folder",
+      "dest": "workspace/fileset-name"
+    }
+  ],
+  "download": [
+    {
+      "src": "workspace/fileset-name",
+      "dest": "local_folder"
+    }
+  ]
+}
+```
+
+- `upload[].src`: Path relative to job storage defined by NEMO_JOB_PERSISTENT_JOB_STORAGE_PATH (mounted at `/var/run/scratch`)
+- `upload[].dest`: Target FileSet in format `workspace/fileset-name`
+- `download[].src`: Source FileSet in format `workspace/fileset-name`
+- `download[].dest`: Path relative to job storage defined by NEMO_JOB_PERSISTENT_JOB_STORAGE_PATH
diff --git a/services/automodel/src/nmp/automodel/tasks/docker/docker-compose.yaml b/services/automodel/src/nmp/automodel/tasks/docker/docker-compose.yaml
new file mode 100644
index
00000000..eae2b4a1 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/docker/docker-compose.yaml @@ -0,0 +1,52 @@ +# Docker Compose for file_io task local testing +# +# Usage: +# # Start the task (runs once and exits) +# docker compose up +# +# # Run with custom command +# docker compose run --rm file-io run task --task nmp.customizer.tasks.file_io +# +# Prerequisites: +# - Build the image first (from Platform repo root): +# docker buildx bake -f docker-bake.automodel.hcl nmp-automodel-tasks-docker +# - Have NeMo Platform running at http://localhost:8080 +# - Create sample_config.json (or use the one provided) + +services: + file-io: + image: ${FILE_IO_IMAGE:-nvcr.io/0921617854601259/nemo-platform-dev/nmp-automodel-tasks:local} + container_name: file-io-task + + # Mount config file and storage directory + # Using test data from services/customizer/tests/tasks/file_io/data/ + # files will be downloaded under services/customizer/tests/tasks/file_io/data/temp which is in .gitignore + volumes: + - ../../../../../../tests/tasks/file_io/data:/var/run/scratch + + environment: + # NeMo Platform URLs - use host.docker.internal to reach host services + NMP_BASE_URL: ${NMP_BASE_URL:-http://host.docker.internal:8000} + NMP_FILES_URL: ${NMP_FILES_URL:-http://host.docker.internal:8000} + NMP_JOBS_URL: ${NMP_JOBS_URL:-http://host.docker.internal:8000} + + # Job configuration paths (container paths) + NEMO_JOB_STEP_CONFIG_FILE_PATH: /var/run/scratch/sample_config.json + NEMO_JOB_PERSISTENT_JOB_STORAGE_PATH: /var/run/scratch + + # Job metadata + NEMO_JOB_ID: ${NEMO_JOB_ID:-multi-file-job} + NEMO_JOB_STEP: ${NEMO_JOB_STEP:-FileIO} + NEMO_JOB_TASK: ${NEMO_JOB_TASK:-file-io-task} + NEMO_JOB_WORKSPACE: ${NEMO_JOB_WORKSPACE:-default} + + # Logging + LOG_LEVEL: ${LOG_LEVEL:-INFO} + + # Default command - run the file_io task + # Note: ENTRYPOINT is already "nemo-platform", so command should NOT include it + command: ["run", "task", "--task", "nmp.automodel.tasks.file_io"] + + 
# For macOS/Windows Docker Desktop - allows reaching host services + extra_hosts: + - "host.docker.internal:host-gateway" diff --git a/services/automodel/src/nmp/automodel/tasks/file_io/__init__.py b/services/automodel/src/nmp/automodel/tasks/file_io/__init__.py new file mode 100644 index 00000000..8df0acb9 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/file_io/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""File I/O task for Automodel customization jobs.""" + +from nmp.automodel.tasks.file_io.run import run + +__all__ = ["run"] diff --git a/services/automodel/src/nmp/automodel/tasks/file_io/__main__.py b/services/automodel/src/nmp/automodel/tasks/file_io/__main__.py new file mode 100644 index 00000000..68981865 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/file_io/__main__.py @@ -0,0 +1,9 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from nmp.automodel.tasks.file_io.run import run + +if __name__ == "__main__": + sys.exit(run()) diff --git a/services/automodel/src/nmp/automodel/tasks/file_io/callbacks.py b/services/automodel/src/nmp/automodel/tasks/file_io/callbacks.py new file mode 100644 index 00000000..51a2a180 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/file_io/callbacks.py @@ -0,0 +1,783 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0
+
+"""Custom fsspec callbacks for progress reporting during file I/O operations."""
+
+import logging
+import os
+import threading
+from abc import abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from fsspec.callbacks import Callback, TqdmCallback
+from nmp.automodel.app.jobs.file_io.schemas import DownloadStats, TaskPhase, UploadStats
+from nmp.automodel.tasks.file_io.progress_reporter import ProgressReporter
+from nmp.common.jobs.schemas import PlatformJobStatus
+
+logger = logging.getLogger(__name__)
+
+
+def get_percentage(current: int, total: int) -> int:
+    """Get percentage of current / total, rounded down to a whole percent.
+
+    Args:
+        current: The current value (numerator).
+        total: The total value (denominator).
+
+    Returns:
+        Integer percentage from 0-100. Returns 0 when total is 0.
+
+    Raises:
+        ValueError: If current > total, or if either value is negative.
+
+    """
+    # Check signs first so a negative input is reported as such rather than
+    # as "current cannot be greater than total".
+    if total < 0:
+        raise ValueError(f"Unexpected negative value of the total value: total={total}, current={current}")
+    if current < 0:
+        raise ValueError(f"Unexpected negative value of the current value: current={current}, total={total}")
+    if current > total:
+        raise ValueError(
+            f"Unexpected value of the current and total values: current={current} cannot be greater than total={total}",
+        )
+
+    if total == 0:
+        return 0
+    # Integer arithmetic avoids float rounding: int((29 / 100) * 100) is 28.
+    return (current * 100) // total
+
+
+@dataclass
+class FileInfo:
+    """A dataclass for file information."""
+
+    path: str
+    size: int
+
+
+class TqdmPerFileUploadCallback(Callback):
+    """A callback that creates a separate tqdm progress bar for each file.
+
+    Unlike TqdmCallback which shows overall progress, this callback creates a new
+    tqdm progress bar when branched() is called for each file. Each file's progress
+    bar shows byte-level progress for that individual file.
+ + Usage: + callback = TqdmPerFileUploadCallback() + filesystem_sdk.put(src, dest, recursive=True, callback=callback) + # Creates a separate progress bar for each file being uploaded + """ + + def __init__(self, src_path: Path, **kwargs: Any): + """Initialize the per-file tqdm callback. + + Args: + **kwargs: Additional arguments passed to the base Callback. + + """ + self.src_path = src_path + super().__init__(**kwargs) + + def branched(self, full_src_path: str, full_dest_path: str, **kwargs: Any) -> TqdmCallback: + """Create a TqdmCallback for this specific file transfer. + + Args: + full_src_path: Source file path. + full_dest_path: Destination file path. + **kwargs: Additional keyword arguments. + + Returns: + A TqdmCallback configured for byte-level progress of this file. + + """ + # Extract just the filename for the progress bar description + if self.src_path.is_file(): + relative_path_upload_dir = self.src_path.name + else: + relative_path_upload_dir = Path(full_src_path).relative_to(self.src_path) + return TqdmCallback( + # https://tqdm.github.io/docs/tqdm + tqdm_kwargs={ + "desc": f"Uploading {relative_path_upload_dir!s}", + # use bytes as the unit + "unit": "B", + # scale the unit to be more readable (e.g. 1024 bytes = 1 KB) + "unit_scale": True, + # divide the unit by 1024 to get the next unit + "unit_divisor": 1024, + # The minimum number of iterations (bytes processed) that must occur before the progress bar refreshes + "miniters": 1, + }, + ) + + +class TqdmPerFileDownloadCallback(Callback): + """A callback that creates a separate tqdm progress bar for each file download. + + Similar to TqdmPerFileUploadCallback but for download operations. Creates a new + tqdm progress bar when branched() is called for each file being downloaded. + + The callback accepts a file_sizes dict to set the total size for each file's + progress bar. 
This is necessary because the SDK may not receive Content-Length + headers for streaming downloads (e.g., when chunked transfer encoding is used). + + Usage: + # Build file_sizes from listing + files = list_fileset_files(fileset) + file_sizes = {f.path.lstrip("/"): f.size for f in files} + + callback = TqdmPerFileDownloadCallback( + dest_path=dest_dir, + fileset_path="workspace/fileset", + file_sizes=file_sizes, + ) + filesystem_sdk.get(src, dest, recursive=True, callback=callback) + # Creates a separate progress bar for each file being downloaded + """ + + def __init__(self, dest_path: Path, fileset_path: str, file_sizes: dict[str, int] | None = None, **kwargs: Any): + """Initialize the per-file tqdm download callback. + + Args: + dest_path: The local destination directory path. + fileset_path: The fileset path (e.g., "workspace/fileset") used to extract + relative file paths from full source paths. + file_sizes: Optional dict mapping relative file paths to their sizes in bytes. + Used to set the progress bar's total for percentage display. + **kwargs: Additional arguments passed to the base Callback. + + """ + self.dest_path = dest_path + self.fileset_path = fileset_path.rstrip("/") + self.file_sizes = file_sizes or {} + super().__init__(**kwargs) + + def branched(self, full_src_path: str, full_dest_path: str, **kwargs: Any) -> TqdmCallback: + """Create a TqdmCallback for this specific file download. + + Args: + full_src_path: Source file path in the fileset (e.g., "workspace/fileset/dir/file.txt"). + full_dest_path: Destination local file path. + **kwargs: Additional keyword arguments. + + Returns: + A TqdmCallback configured for byte-level progress of this file. 
+ + """ + # Extract relative path for the progress bar description + # full_dest_path is the full local path, we want to show just the filename or relative path + dest_full_path = Path(full_dest_path) + if self.dest_path.is_file(): + relative_path = dest_full_path.name + else: + try: + relative_path = dest_full_path.relative_to(self.dest_path) + except ValueError: + # If can't compute relative path, use filename + relative_path = dest_full_path.name + + # Extract relative file path from full source path to look up size + # full_src_path format: "workspace/fileset/relative/path/to/file.txt" + # We need to extract "relative/path/to/file.txt" + relative_file_path = full_src_path + if full_src_path.startswith(self.fileset_path): + relative_file_path = full_src_path[len(self.fileset_path) :].lstrip("/") + + # Look up file size from pre-computed mapping + file_size = self.file_sizes.get(relative_file_path) + + callback = TqdmCallback( + tqdm_kwargs={ + "desc": f"Downloading {relative_path!s}", + "unit": "B", + "unit_scale": True, + "unit_divisor": 1024, + "miniters": 1, + }, + ) + + # Set size if we know it - this enables percentage display in tqdm + # Must be called via set_size() rather than tqdm_kwargs["total"] because + # the SDK may also call set_size() from Content-Length header + if file_size is not None: + callback.set_size(file_size) + + return callback + + +class BaseProgressCallback(Callback): + """Base class for file upload/download progress callbacks. + + This abstract base class provides common functionality for tracking file transfer + progress and reporting to the Jobs service. Subclasses implement operation-specific + behavior (upload vs download). + + Thread Safety: + This callback uses threading.Lock for synchronization. FilesetFileSystem is + async-first and transfers files concurrently. The lock protects against + concurrent access when multiple files complete simultaneously. 
+ + Attributes: + progress_reporter: The progress reporter for sending updates to Jobs service. + fileset_name: The name of the fileset (workspace/name format). + total_files: Total number of files to transfer. + total_size: Total size of all files in bytes. + stats: Mutable stats object to track progress (UploadStats or DownloadStats). + _lock: Threading lock for thread-safe stats updates. + + """ + + progress_reporter: ProgressReporter + fileset_name: str + total_files: int + total_size: int + stats: UploadStats | DownloadStats + _lock: threading.Lock + + def __init__( + self, + progress_reporter: ProgressReporter, + fileset_name: str, + total_files: int, + total_size: int, + stats: UploadStats | DownloadStats, + **kwargs: Any, + ): + """Initialize the progress callback. + + Args: + progress_reporter: The progress reporter for sending updates to Jobs service. + fileset_name: The name of the fileset (workspace/name format). + total_files: Total number of files to transfer. + total_size: Total size of all files in bytes. + stats: Mutable stats object to track progress. + **kwargs: Additional arguments passed to the base Callback. + + """ + super().__init__(**kwargs) + self.progress_reporter = progress_reporter + self.fileset_name = str(fileset_name) + self.total_files = total_files + self.total_size = total_size + self.stats = stats + self._lock = threading.Lock() + + @staticmethod + def list_local_files(src_path: Path) -> list[FileInfo]: + """List all files from a local path (file or directory). + + If src_path is a file, returns a single FileInfo with the filename. + If src_path is a directory, recursively lists all files. + + Returns list of FileInfo objects with 'path' (relative path) and 'size' keys. + This mirrors the format returned by list_fileset_files. + """ + if not src_path.exists(): + logger.warning(f"Failed to list local files. 
Source path does not exist: {src_path}") + return [] + + try: + # Handle single file + if src_path.is_file(): + logger.info(f"Found 1 file: {src_path.name}") + return [ + FileInfo( + path=src_path.name, + size=src_path.stat().st_size, + ), + ] + + # Handle directory + files = [] + for root, _, filenames in os.walk(src_path): + for filename in filenames: + full_path = Path(root) / filename + relative_path = full_path.relative_to(src_path) + files.append( + FileInfo( + path=str(relative_path), + size=full_path.stat().st_size, + ), + ) + logger.info(f"Found {len(files)} files in {src_path}") + return files + except Exception as e: + logger.warning(f"Failed to list local files. Source path: {src_path}. Error: {e}") + return [] + + @abstractmethod + def branched(self, source_path: str, dest_path: str, **kwargs: Any) -> "BaseSingleFileCallback": + """Create a child callback for a single file transfer. + + Args: + source_path: Source file path. + dest_path: Destination file path. + **kwargs: Additional keyword arguments. + + Returns: + A BaseSingleFileCallback subclass for tracking this file's transfer. + + """ + ... + + +class BaseSingleFileCallback(Callback): + """Base class for single file upload/download callbacks. + + This abstract base class provides common functionality for tracking individual + file transfers within a batch operation. Subclasses implement operation-specific + behavior via the template method pattern. 
+ + The close() method uses the template method pattern, calling abstract methods + that subclasses override to provide operation-specific behavior: + - _get_phase(): Returns the TaskPhase for this operation + - _get_file_display_path(): Returns the path to display for logging + - _update_stats(): Updates the parent's stats for this operation + - _build_status_details(): Builds the status_details dict for progress reporting + """ + + parent: BaseProgressCallback + source_path: str + dest_path: str + _completed: bool + + def __init__( + self, + parent: BaseProgressCallback, + source_path: str, + dest_path: str, + **kwargs: Any, + ): + """Initialize the single file callback. + + Args: + parent: The parent progress callback. + source_path: Path to the source file. + dest_path: Destination path for the file. + **kwargs: Additional arguments passed to the base Callback. + + """ + super().__init__(**kwargs) + self.parent = parent + self.source_path = source_path + self.dest_path = dest_path + self._completed = False + + @abstractmethod + def _get_phase(self) -> str: + """Return the TaskPhase for this operation.""" + ... + + @abstractmethod + def _get_file_display_path(self) -> str: + """Return the path to use for display/logging.""" + ... + + @abstractmethod + def _update_stats(self) -> None: + """Update the parent's stats for this operation (called within lock).""" + ... + + @abstractmethod + def _get_files_count(self) -> int: + """Return the current files count from stats (called within lock).""" + ... + + @abstractmethod + def _build_status_details(self, files_count: int, total_bytes: int, current_file: str) -> dict[str, Any]: + """Build the status_details dict for progress reporting. + + Args: + files_count: Number of files transferred so far. + total_bytes: Total bytes transferred so far. + current_file: Name of the current file for display. + + Returns: + Dictionary with status details for the progress report. + + """ + ... 
+ + def close(self) -> None: + """Called when the file transfer completes. + + Updates the parent's statistics and reports progress to the Jobs service. + Thread-safe: uses parent's lock to protect stats updates. + """ + if self._completed: + return + + self._completed = True + parent = self.parent + + # Extract the filename for logging/display + current_file = self._get_file_display_path() + + # Thread-safe stats update + with parent._lock: + # Update stats (operation-specific) + self._update_stats() + + # Capture current values while holding the lock + files_count = self._get_files_count() + total_bytes = parent.stats.total_bytes + + logger.debug(f"File transferred: {current_file} ({files_count}/{parent.total_files})") + + # Report progress to Jobs service (outside lock to avoid holding it during I/O) + parent.progress_reporter.update_progress( + status=PlatformJobStatus.ACTIVE, + status_details=self._build_status_details(files_count, total_bytes, current_file), + ) + + def __enter__(self) -> "BaseSingleFileCallback": + return self + + def __exit__(self, *exc_args: object) -> None: + self.close() + + +class FileUploadProgressCallback(BaseProgressCallback): + """Callback for tracking file upload progress and reporting to the Jobs service. + + This callback integrates with fsspec's callback mechanism to report progress + after each file is uploaded. 
It uses the branched callback pattern where: + - The parent callback tracks overall upload statistics + - Child callbacks are created for each file via `branched()` + - When a child callback closes, it signals file completion to the parent + + Usage: + callback = FileUploadProgressCallback( + progress_reporter=reporter, + src_path=src_path, + fileset_name="workspace/fileset", + stats=upload_stats, + ) + filesystem_sdk.put(src, dest, recursive=True, callback=callback) + """ + + stats: UploadStats + + def __init__( + self, + progress_reporter: ProgressReporter, + src_path: Path, + fileset_name: str, + stats: UploadStats, + **kwargs: Any, + ): + """Initialize the upload progress callback. + + Args: + progress_reporter: The progress reporter for sending updates to Jobs service. + src_path: The source path (file or directory) to upload. + fileset_name: The name of the target fileset (workspace/name format). + stats: Mutable UploadStats object to track progress. + **kwargs: Additional arguments passed to the base Callback. 
+ + """ + # List files to get stats before upload + files = self.list_local_files(src_path) + + if not files: + logger.warning(f"Source path {src_path} contains no files") + + total_files = len(files) + total_size = sum(f.size for f in files) + + # Initialize base class with computed values + super().__init__( + progress_reporter=progress_reporter, + fileset_name=fileset_name, + total_files=total_files, + total_size=total_size, + stats=stats, + **kwargs, + ) + + logger.info(f"Uploading {total_files} files ({total_size} bytes) to {self.fileset_name}") + + # Report initial progress + progress_reporter.update_progress( + status=PlatformJobStatus.ACTIVE, + status_details={ + "phase": TaskPhase.UPLOADING, + "fileset": self.fileset_name, + "total_files": total_files, + "total_size": total_size, + "uploaded_files": 0, + "uploaded_bytes": 0, + }, + ) + + def branched(self, source_path: str, dest_path: str, **kwargs: Any) -> "SingleFileUploadCallback": + """Create a child callback for a single file upload. + + This method is called by fsspec when starting a file transfer within + a recursive put operation. It returns a child callback that tracks + the individual file's progress and reports completion to the parent. + + Args: + source_path: Source file path. + dest_path: Destination file path. + **kwargs: Additional keyword arguments. + + Returns: + A SingleFileUploadCallback for tracking this file's upload. + + """ + return SingleFileUploadCallback( + parent=self, + source_path=source_path, + dest_path=dest_path, + **kwargs, + ) + + +class SingleFileUploadCallback(BaseSingleFileCallback): + """Callback for tracking a single file upload within a batch operation. + + This child callback is created by FileUploadProgressCallback.branched() + for each file being uploaded. When the upload completes and this callback + is closed, it notifies the parent to update overall progress.
+ """ + + parent: FileUploadProgressCallback + + def _get_phase(self) -> str: + """Return the TaskPhase for upload operations.""" + return TaskPhase.UPLOADING + + def _get_file_display_path(self) -> str: + """Return the destination filename for display.""" + return self.dest_path.split("/")[-1] if "/" in self.dest_path else self.dest_path + + def _update_stats(self) -> None: + """Update the parent's upload stats.""" + self.parent.stats.files_uploaded += 1 + if self.size is not None: + self.parent.stats.total_bytes += self.size + + def _get_files_count(self) -> int: + """Return the current uploaded files count.""" + return self.parent.stats.files_uploaded + + def _build_status_details(self, files_count: int, total_bytes: int, current_file: str) -> dict[str, Any]: + """Build the status_details dict for upload progress reporting.""" + return { + "phase": TaskPhase.UPLOADING, + "fileset": self.parent.fileset_name, + "total_files": self.parent.total_files, + "total_size": self.parent.total_size, + "uploaded_files": files_count, + "uploaded_bytes": total_bytes, + "current_file": current_file, + "progress_pct": get_percentage(files_count, self.parent.total_files), + } + + +class FileDownloadProgressCallback(BaseProgressCallback): + """Callback for tracking file download progress and reporting to the Jobs service. + + Similar to FileUploadProgressCallback but for download operations. + + Usage: + callback = FileDownloadProgressCallback( + progress_reporter=reporter, + fileset_name="workspace/fileset", + total_files=10, + total_size=1024000, + stats=download_stats, + ) + filesystem_sdk.get(src, dest, recursive=True, callback=callback) + """ + + stats: DownloadStats + + def __init__( + self, + progress_reporter: ProgressReporter, + fileset_name: str, + total_files: int, + total_size: int, + stats: DownloadStats, + **kwargs: Any, + ): + """Initialize the download progress callback. + + Args: + progress_reporter: The progress reporter for sending updates to Jobs service. 
+ fileset_name: The name of the source fileset (workspace/name format). + total_files: Total number of files to download. + total_size: Total size of all files in bytes. + stats: Mutable DownloadStats object to track progress. + **kwargs: Additional arguments passed to the base Callback. + + """ + super().__init__( + progress_reporter=progress_reporter, + fileset_name=fileset_name, + total_files=total_files, + total_size=total_size, + stats=stats, + **kwargs, + ) + + logger.info(f"Downloading {total_files} files ({total_size} bytes) from {self.fileset_name}") + + # Report initial progress + progress_reporter.update_progress( + status=PlatformJobStatus.ACTIVE, + status_details={ + "phase": TaskPhase.DOWNLOADING, + "fileset": self.fileset_name, + "total_files": total_files, + "total_size": total_size, + "downloaded_files": 0, + "downloaded_bytes": 0, + }, + ) + + def branched(self, source_path: str, dest_path: str, **kwargs: Any) -> "SingleFileDownloadCallback": + """Create a child callback for a single file download. + + Args: + source_path: Source file path in the fileset. + dest_path: Destination local file path. + **kwargs: Additional keyword arguments. + + Returns: + A SingleFileDownloadCallback for tracking this file's download. + + """ + return SingleFileDownloadCallback( + parent=self, + source_path=source_path, + dest_path=dest_path, + **kwargs, + ) + + +class SingleFileDownloadCallback(BaseSingleFileCallback): + """Callback for tracking a single file download within a batch operation. + + This child callback is created by FileDownloadProgressCallback.branched() + for each file being downloaded. When the download completes and this callback + is closed, it notifies the parent to update overall progress. 
+ """ + + parent: FileDownloadProgressCallback + + def _get_phase(self) -> str: + """Return the TaskPhase for download operations.""" + return TaskPhase.DOWNLOADING + + def _get_file_display_path(self) -> str: + """Return the source filename for display.""" + return self.source_path.split("/")[-1] if "/" in self.source_path else self.source_path + + def _update_stats(self) -> None: + """Update the parent's download stats.""" + self.parent.stats.files_downloaded += 1 + if self.size is not None: + self.parent.stats.total_bytes += self.size + + def _get_files_count(self) -> int: + """Return the current downloaded files count.""" + return self.parent.stats.files_downloaded + + def _build_status_details(self, files_count: int, total_bytes: int, current_file: str) -> dict[str, Any]: + """Build the status_details dict for download progress reporting.""" + return { + "phase": TaskPhase.DOWNLOADING, + "fileset": self.parent.fileset_name, + "total_files": self.parent.total_files, + "total_size": self.parent.total_size, + "downloaded_files": files_count, + "downloaded_bytes": total_bytes, + "current_file": current_file, + "progress_pct": get_percentage(files_count, self.parent.total_files), + } + + +class CompositeCallback(Callback): + """A callback that delegates to multiple child callbacks. + + This allows combining multiple callbacks (e.g., TqdmCallback for console progress + and FileUploadProgressCallback for Jobs service reporting) into a single callback + that can be passed to fsspec operations. + + All callback methods are forwarded to each child callback in order. + + Usage: + tqdm_cb = TqdmCallback(tqdm_kwargs={"desc": "Uploading"}) + progress_cb = FileUploadProgressCallback(...) + composite = CompositeCallback(tqdm_cb, progress_cb) + filesystem_sdk.put(src, dest, recursive=True, callback=composite) + """ + + def __init__(self, *callbacks: Callback, **kwargs: Any): + """Initialize with multiple callbacks. 
+ + Args: + *callbacks: Variable number of Callback instances to delegate to. + **kwargs: Additional arguments passed to the base Callback. + + """ + super().__init__(**kwargs) + self.callbacks = list(callbacks) + + def set_size(self, size: int) -> None: + """Set size on all child callbacks.""" + self.size = size + for cb in self.callbacks: + cb.set_size(size) + + def absolute_update(self, value: int) -> None: + """Update absolute value on all child callbacks.""" + self.value = value + for cb in self.callbacks: + cb.absolute_update(value) + + def relative_update(self, inc: int = 1) -> None: + """Update relative value on all child callbacks.""" + self.value += inc + for cb in self.callbacks: + cb.relative_update(inc) + + def branched(self, source_path: str, dest_path: str, **kwargs: Any) -> "CompositeCallback": + """Create a composite child callback from all child callbacks' branched results. + + Each child callback's branched() method is called, and the results are + wrapped in a new CompositeCallback. + + Args: + source_path: Source path. + dest_path: Destination path. + **kwargs: Additional keyword arguments. + + Returns: + A new CompositeCallback wrapping all child callbacks' branched results. 
+ + """ + child_callbacks = [cb.branched(source_path, dest_path, **kwargs) for cb in self.callbacks] + return CompositeCallback(*child_callbacks) + + def call(self, hook_name: str | None = None, **kwargs: Any) -> None: + """Call hooks on all child callbacks.""" + for cb in self.callbacks: + cb.call(hook_name, **kwargs) + + def close(self) -> None: + """Close all child callbacks.""" + for cb in self.callbacks: + cb.close() + + def __enter__(self) -> "CompositeCallback": + for cb in self.callbacks: + cb.__enter__() + return self + + def __exit__(self, *exc_args: object) -> None: + for cb in self.callbacks: + cb.__exit__(*exc_args) diff --git a/services/automodel/src/nmp/automodel/tasks/file_io/progress_reporter.py b/services/automodel/src/nmp/automodel/tasks/file_io/progress_reporter.py new file mode 100644 index 00000000..00fa6601 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/file_io/progress_reporter.py @@ -0,0 +1,93 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import logging +from typing import Any, Protocol + +from nemo_platform import NeMoPlatform, omit +from nemo_platform._exceptions import APIError +from nmp.automodel.app.jobs.context import NMPJobContext +from nmp.automodel.app.jobs.file_io.schemas import ProgressReportError +from nmp.automodel.tasks.file_io.utils import sdk_error_handler +from nmp.common.jobs.schemas import PlatformJobStatus + +logger = logging.getLogger(__name__) + + +class ProgressReporter(Protocol): + """Interface for reporting task progress.""" + + def update_progress( + self, + status: PlatformJobStatus, + status_details: dict[str, Any] | None = None, + error_details: dict[str, Any] | None = None, + error_stack: str | None = None, + ) -> None: + """Update task progress.""" + ... + + +class NoOpProgressReporter: + """Progress reporter that does nothing. 
Used when Jobs service is not configured.""" + + def update_progress( + self, + status: PlatformJobStatus, + status_details: dict[str, Any] | None = None, + error_details: dict[str, Any] | None = None, + error_stack: str | None = None, + ) -> None: + """No-op: silently ignore progress updates.""" + + +class JobsServiceProgressReporter: + """Reports progress to the Jobs service via SDK.""" + + def __init__(self, sdk: NeMoPlatform, workspace: str, job_id: str, step_name: str, task_id: str): + self.sdk = sdk + self.workspace = workspace + self.job_id = job_id + self.step_name = step_name + self.task_id = task_id + + def update_progress( + self, + status: PlatformJobStatus, + status_details: dict[str, object] | None = None, + error_details: dict[str, object] | None = None, + error_stack: str | None = None, + ) -> None: + """Update task progress via SDK.""" + try: + with sdk_error_handler( + ProgressReportError, + f"update progress for task: {self.task_id}, job: {self.job_id}, step: {self.step_name}", + passthrough=(APIError,), + ): + self.sdk.jobs.tasks.create_or_update( + self.task_id, + workspace=self.workspace, + job=self.job_id, + step=self.step_name, + status=status.value, + status_details=status_details if status_details else omit, + error_details=error_details if error_details else omit, + error_stack=error_stack if error_stack else omit, + ) + logger.debug(f"Progress updated: {status} - {status_details}") + except Exception as e: + logger.warning( + f"Failed to report progress for task {self.task_id}, job {self.job_id}, step {self.step_name}: {e}", + ) + + @staticmethod + def create_progress_reporter(sdk: NeMoPlatform, job_ctx: NMPJobContext) -> ProgressReporter: + """Create JobsServiceProgressReporter when jobs_url is set, else NoOpProgressReporter.""" + if job_ctx.jobs_url: + logger.info(f"Progress reporting enabled: {job_ctx.jobs_url}") + return JobsServiceProgressReporter( + sdk, job_ctx.workspace, job_ctx.job_id, job_ctx.step, job_ctx.normalized_task + ) + 
logger.info("Progress reporting disabled: jobs_url not configured") + return NoOpProgressReporter() diff --git a/services/automodel/src/nmp/automodel/tasks/file_io/run.py b/services/automodel/src/nmp/automodel/tasks/file_io/run.py new file mode 100644 index 00000000..f14acd7b --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/file_io/run.py @@ -0,0 +1,560 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""File I/O task entry point. + +Handles file operations between NeMo Platform Files Service and the job's shared PVC. + +The task reads configuration and performs: +- Downloads: If config.download is non-empty, download files from FileSets to local paths +- Uploads: If config.upload is non-empty, upload files from local paths to FileSets + +Usage: + export NEMO_JOB_STEP_CONFIG_FILE_PATH= + python -m nmp.automodel.tasks.file_io +""" + +import logging +from pathlib import Path + +import httpx + +# https://docs.nvidia.com/nemo/microservices/latest/pysdk/index.html#handling-errors +from nemo_platform import ( + APIConnectionError, + APITimeoutError, + ConflictError, + InternalServerError, + NeMoPlatform, + NotFoundError, +) +from nemo_platform.types.files.fileset_file import FilesetFile +from nmp.automodel.app.constants import SERVICE_NAME +from nmp.automodel.app.jobs.context import NMPJobContext +from nmp.automodel.app.jobs.file_io.schemas import ( + DownloadItem, + DownloadStats, + FileDownloadError, + FileSetRef, + FileUploadError, + PathTraversalError, + TaskPhase, + UploadItem, + UploadStats, +) +from nmp.automodel.tasks.file_io.callbacks import ( + CompositeCallback, + FileDownloadProgressCallback, + FileUploadProgressCallback, + TqdmPerFileDownloadCallback, + TqdmPerFileUploadCallback, +) +from nmp.automodel.tasks.file_io.progress_reporter import JobsServiceProgressReporter, ProgressReporter +from nmp.automodel.tasks.file_io.utils import ( + 
filesystem_sdk_error_handler, + get_config, + sdk_error_handler, + validate_safe_path, + validate_storage_path, +) +from nmp.common.jobs.schemas import PlatformJobStatus +from nmp.common.sdk_factory import get_task_sdk +from tenacity import before_sleep_log, retry, retry_if_exception_type, stop_after_attempt, wait_exponential + +logger = logging.getLogger(__name__) + +# Timeout configurations for SDK operations (httpx.Timeout for API calls) +CREATE_FILESET_TIMEOUT = httpx.Timeout(10.0, connect=10.0) +LIST_FILES_TIMEOUT = httpx.Timeout(10.0, connect=10.0) + +# Timeout configurations for FilesetFileSystem operations. +# These are passed via sdk.with_options(timeout=...) and control the httpx client. +# httpx.Timeout(read=...) is the max wait for a single chunk (16MB by default), NOT total transfer time. +# nemo-platform/src/nemo_platform/filesets/filesystem/filesystem.py > blocksize = 16 * 1024 * 1024 # 16MB +# It's a socket-level timeout. Each individual socket read has its own timeout window. +# SDK defaults httpx.Timeout(timeout=60, connect=5.0) nemo-platform/src/nemo_platform/_constants.py +DOWNLOAD_TIMEOUT = httpx.Timeout(30.0, read=5 * 60) # 30s connect/pool, 5min per-chunk read +UPLOAD_TIMEOUT = httpx.Timeout(30.0, write=10 * 60, read=5 * 60) # 30s connect/pool, 10min write, 5min read + +# Retry configuration +MAX_RETRIES = 3 +INITIAL_BACKOFF_SECONDS = 1.0 +MAX_BACKOFF_SECONDS = 30.0 + +# Transient exceptions that should trigger retries for filesystem operations. +# FilesetFileSystem uses httpx under the hood, so we retry on httpx transient errors +# in addition to SDK-level transient errors. 
+TRANSIENT_FILESYSTEM_EXCEPTIONS = ( + httpx.TimeoutException, + httpx.ConnectError, + httpx.ReadTimeout, +) + + +class FileIORunner: + def __init__( + self, + sdk: NeMoPlatform, + progress_reporter: ProgressReporter, + job_ctx: NMPJobContext, + ): + self.sdk = sdk + self.progress_reporter = progress_reporter + self.job_ctx = job_ctx + + def list_fileset_files( + self, + fileset: FileSetRef, + ) -> list[FilesetFile]: + """List files in a FileSet. + + Returns a list of FilesetFile objects with `path` and `size` attributes. + """ + try: + with sdk_error_handler(FileDownloadError, f"list files in fileset {fileset}", passthrough=(NotFoundError,)): + response = self.sdk.with_options(timeout=LIST_FILES_TIMEOUT).files.list( + fileset=fileset.name, + workspace=fileset.workspace, + ) + logger.info(f"Found {len(response.data)} files in FileSet {fileset!s}") + return response.data + except NotFoundError as e: + raise FileDownloadError( + f"FileSet {fileset!s} not found. Please ensure the FileSet exists and contains the expected files.", + ) from e + + def download_fileset( + self, + fileset: FileSetRef, + dest_dir: Path, + ) -> DownloadStats: + """Download all files from a FileSet to a destination directory. + + Uses FilesetFileSystem.get() with recursive=True for efficient batch downloads. + Progress is tracked via two callbacks combined in a CompositeCallback: + - TqdmPerFileDownloadCallback: Creates a separate console progress bar per file (shows bytes) + - FileDownloadProgressCallback: Reports progress to Jobs service after each file + + Args: + fileset: The source FileSet reference. + dest_dir: The destination directory path. + + Returns: + DownloadStats with files_downloaded, total_bytes, and failed_files counts. + + Raises: + FileDownloadError: If the download fails.
+ + """ + stats = DownloadStats() + fileset_name = str(fileset) + + # List files in the fileset to get total count and size + files = self.list_fileset_files(fileset) + + if not files: + logger.warning(f"FileSet {fileset_name} contains no files") + return stats + + total_files = len(files) + total_size = sum(f.size for f in files) + + # Ensure destination directory exists + dest_dir.mkdir(parents=True, exist_ok=True) + + # Build file sizes mapping for progress bar display + # Maps relative file paths to their sizes in bytes + file_sizes = {f.path.lstrip("/"): f.size for f in files} + + # Create callbacks: + # 1. TqdmPerFileDownloadCallback for console progress - creates a separate progress bar per file + tqdm_callback = TqdmPerFileDownloadCallback( + dest_path=dest_dir, + fileset_path=fileset_name, + file_sizes=file_sizes, + ) + + # 2. FileDownloadProgressCallback for Jobs service reporting + jobs_callback = FileDownloadProgressCallback( + progress_reporter=self.progress_reporter, + fileset_name=fileset_name, + total_files=total_files, + total_size=total_size, + stats=stats, + ) + + # Combine both callbacks into a composite that delegates to both + composite_callback = CompositeCallback(tqdm_callback, jobs_callback) + + with filesystem_sdk_error_handler( + FileDownloadError, + f"download from '{fileset_name}' to '{dest_dir}'", + ): + self._download_with_retry( + fileset_name=fileset.name, + fileset_workspace=fileset.workspace, + dest_dir=str(dest_dir), + callback=composite_callback, + ) + + logger.info(f"Download complete: {stats.files_downloaded} files, {stats.total_bytes} bytes") + return stats + + @retry( + stop=stop_after_attempt(MAX_RETRIES), + wait=wait_exponential(multiplier=2, min=INITIAL_BACKOFF_SECONDS, max=MAX_BACKOFF_SECONDS), + retry=retry_if_exception_type(TRANSIENT_FILESYSTEM_EXCEPTIONS), + reraise=True, + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + def _download_with_retry( + self, + fileset_name: str, + fileset_workspace: str | 
None, + dest_dir: str, + callback: CompositeCallback, + ) -> None: + """Internal method with retry logic for downloading from FilesetFileSystem.""" + self.sdk.with_options(timeout=DOWNLOAD_TIMEOUT).files.download( + fileset=fileset_name, + workspace=fileset_workspace, + local_path=dest_dir, + callback=callback, # type: ignore[arg-type] + ) + + def upload_fileset( + self, + fileset: FileSetRef, + src_path: Path, + ) -> UploadStats: + """Upload all files from a source path (file or directory) to a FileSet. + + Uses FilesetFileSystem.put() with recursive=True for efficient batch uploads. + Progress is tracked via two callbacks combined in a CompositeCallback: + - TqdmPerFileUploadCallback: Creates a separate console progress bar per file (shows bytes) + - FileUploadProgressCallback: Reports progress to Jobs service after each file + + Args: + fileset: The target FileSet reference. + src_path: The source path, can be a single file or a directory. + progress_reporter: Progress reporter for status updates. + + Returns: + UploadStats with files_uploaded, total_bytes, and failed_files counts. + + Raises: + FileUploadError: If the upload fails. + + """ + stats = UploadStats() + fileset_name = str(fileset) + + # Create callbacks: + # 1. TqdmPerFileUploadCallback for console progress - creates a separate progress bar per file + tqdm_callback = TqdmPerFileUploadCallback(src_path=src_path) + + # 2.
FileUploadProgressCallback for Jobs service reporting + jobs_callback = FileUploadProgressCallback( + progress_reporter=self.progress_reporter, + src_path=src_path, + fileset_name=fileset_name, + stats=stats, + ) + + # Combine both callbacks into a composite that delegates to both + composite_callback = CompositeCallback(tqdm_callback, jobs_callback) + + # Build local and remote paths for upload + # remote_path is relative within the fileset (e.g., "" for root, "filename" for single file) + if src_path.is_dir(): + # Add trailing slash to source to copy directory CONTENTS (not the directory itself) + # This follows rsync/scp convention: "dir/" copies contents, "dir" copies the directory + local_path = f"{src_path}/" + remote_path = "" # Upload to fileset root + else: + # Single file: upload to fileset root with same filename + local_path = str(src_path) + remote_path = src_path.name + + with filesystem_sdk_error_handler( + FileUploadError, + f"upload from '{src_path}' to '{fileset_name}'", + ): + self._upload_with_retry( + local_path=local_path, + remote_path=remote_path, + fileset_name=fileset.name, + fileset_workspace=fileset.workspace, + callback=composite_callback, + ) + + logger.info(f"Upload complete: {stats.files_uploaded} files, {stats.total_bytes} bytes") + return stats + + @retry( + stop=stop_after_attempt(MAX_RETRIES), + wait=wait_exponential(multiplier=2, min=INITIAL_BACKOFF_SECONDS, max=MAX_BACKOFF_SECONDS), + retry=retry_if_exception_type(TRANSIENT_FILESYSTEM_EXCEPTIONS), + reraise=True, + before_sleep=before_sleep_log(logger, logging.WARNING, exc_info=True), + ) + def _upload_with_retry( + self, + local_path: str, + remote_path: str, + fileset_name: str, + fileset_workspace: str | None, + callback: CompositeCallback, + ) -> None: + """Internal method with retry logic for uploading to FilesetFileSystem.""" + self.sdk.with_options(timeout=UPLOAD_TIMEOUT).files.upload( + local_path=local_path, + remote_path=remote_path, + fileset=fileset_name, + 
workspace=fileset_workspace, + callback=callback, # type: ignore[arg-type] + ) + + def run_download(self, downloads: list[DownloadItem]) -> None: + """Execute download operations. + + Downloads files from FileSets to job storage based on downloads list. + """ + if not downloads: + logger.info("No downloads configured, skipping download operation") + return + + storage_path = validate_storage_path(self.job_ctx.storage_path) + + logger.info(f"Starting download operation: {len(downloads)} fileset(s) to download") + + # Report task started + self.progress_reporter.update_progress( + status=PlatformJobStatus.ACTIVE, + status_details={ + "phase": TaskPhase.DOWNLOADING, + "total_filesets": len(downloads), + "completed_filesets": 0, + }, + ) + + total_stats = DownloadStats() + + for idx, item in enumerate(downloads): + fileset = item.src + # Validate destination path to prevent path traversal attacks + dest_dir = validate_safe_path(storage_path, item.dest) + + logger.info(f"[{idx + 1}/{len(downloads)}] Downloading from {fileset!s} to {dest_dir}") + + self.progress_reporter.update_progress( + status=PlatformJobStatus.ACTIVE, + status_details={ + "phase": TaskPhase.DOWNLOADING, + "total_filesets": len(downloads), + "completed_filesets": idx, + "current_fileset": f"{fileset!s}", + }, + ) + + stats = self.download_fileset( + fileset, + dest_dir, + ) + total_stats.files_downloaded += stats.files_downloaded + total_stats.total_bytes += stats.total_bytes + + logger.info(f"FileSet download complete: {stats.files_downloaded} files, {stats.total_bytes} bytes") + + logger.info( + f"All downloads complete: {total_stats.files_downloaded} files, {total_stats.total_bytes} bytes total", + ) + + def create_fileset(self, fileset: FileSetRef, metadata: dict | None = None) -> None: + """Create a FileSet. Skip if it already exists. + + Uses retry logic for transient errors and converts SDK exceptions to FileUploadError. 
+ """ + # sdk_error_handler wraps the retry to convert exceptions after all retries exhaust + with sdk_error_handler(FileUploadError, f"create fileset {fileset}", passthrough=(ConflictError,)): + self._create_fileset_with_retry(fileset, metadata) + + # we don't use sdk retry because it would retry on ConflictError which is expected and would be wasteful + @retry( + stop=stop_after_attempt(MAX_RETRIES), + wait=wait_exponential(multiplier=2, min=INITIAL_BACKOFF_SECONDS, max=MAX_BACKOFF_SECONDS), + retry=retry_if_exception_type((InternalServerError, APITimeoutError, APIConnectionError)), + reraise=True, # means that the last exception will be re-raised after the last retry attempt + ) + def _create_fileset_with_retry(self, fileset: FileSetRef, metadata: dict | None = None) -> None: + """Internal method with retry logic for creating a FileSet.""" + try: + create_kwargs: dict = { + "workspace": fileset.workspace, + "name": fileset.name, + "timeout": CREATE_FILESET_TIMEOUT, + "custom_fields": {"service_source": "automodel"}, + } + if metadata is not None: + create_kwargs["metadata"] = metadata + result = self.sdk.with_options(max_retries=0).files.filesets.create(**create_kwargs) + logger.info(f"Created FileSet: {result.workspace}/{result.name}") + except ConflictError: + # Fileset already exists - patch metadata so tool_calling etc. are not lost + workspace = fileset.workspace or self.job_ctx.workspace + if metadata is not None: + try: + self.sdk.with_options(max_retries=0).files.filesets.update( + name=fileset.name, + workspace=workspace, + metadata=metadata, + timeout=CREATE_FILESET_TIMEOUT, + ) + logger.info(f"Patched existing FileSet metadata: {workspace}/{fileset.name}") + except Exception as e: + logger.warning( + f"Could not patch metadata on existing fileset {workspace}/{fileset.name}: {e}. " + "Upload will continue; model-spec may lack tool_calling/chat_template from source." 
+ ) + + def run_upload(self, uploads: list[UploadItem]) -> None: + """Execute upload operations. + + Uploads files from job storage to FileSets based on uploads list. + + Args: + uploads: List of upload items to process. + """ + if not uploads: + logger.info("No uploads configured, skipping upload operation") + return + + storage_path = validate_storage_path(self.job_ctx.storage_path) + + logger.info(f"Starting upload operation: {len(uploads)} fileset(s) to upload") + + # Report task started + self.progress_reporter.update_progress( + status=PlatformJobStatus.ACTIVE, + status_details={ + "phase": TaskPhase.UPLOADING, + "total_filesets": len(uploads), + "completed_filesets": 0, + }, + ) + + total_stats = UploadStats() + + for idx, item in enumerate(uploads): + if item.dest.workspace is None: + item.dest.workspace = self.job_ctx.workspace + fileset = item.dest + # Validate source path to prevent path traversal attacks + src_path = validate_safe_path(storage_path, item.src) + if not src_path.exists(): + raise FileUploadError(f"Source path does not exist: {src_path}. Ensure the source path exists.") + if not src_path.is_dir() and not src_path.is_file(): + raise FileUploadError( + f"Source path is not a file or directory: {src_path}. 
Ensure the source path is a file or directory.", + ) + + logger.info(f"[{idx + 1}/{len(uploads)}] Uploading from {src_path} to {fileset!s}") + + self.progress_reporter.update_progress( + status=PlatformJobStatus.ACTIVE, + status_details={ + "phase": TaskPhase.UPLOADING, + "total_filesets": len(uploads), + "completed_filesets": idx, + "current_fileset": str(fileset), + }, + ) + + self.create_fileset(fileset, metadata=item.metadata) + + stats = self.upload_fileset( + fileset, + src_path, + ) + total_stats.files_uploaded += stats.files_uploaded + total_stats.total_bytes += stats.total_bytes + + logger.info(f"FileSet upload complete: {stats.files_uploaded} files, {stats.total_bytes} bytes") + + logger.info(f"All uploads complete: {total_stats.files_uploaded} files, {total_stats.total_bytes} bytes total") + + +def run(sdk: NeMoPlatform | None = None, job_ctx: NMPJobContext | None = None) -> int: + """Execute the file I/O task. + + Processes downloads and uploads based on the configuration. + + Args: + sdk: Optional SDK instance for dependency injection (for testing). + If None, creates one via get_task_sdk(). + job_ctx: Optional job context for dependency injection (for testing). + If None, creates one via NMPJobContext.from_env(). + + Returns: + Exit code (0 for success, non-zero for failure). 
+ + """ + job_ctx = job_ctx or NMPJobContext.from_env() + validate_storage_path(job_ctx.storage_path) + + sdk_owned = sdk is None + progress_reporter: ProgressReporter | None = None + try: + sdk = sdk or get_task_sdk(SERVICE_NAME) + # Initialize progress reporter (no-op if Jobs URL not configured) + progress_reporter = JobsServiceProgressReporter.create_progress_reporter(sdk, job_ctx) + runner = FileIORunner(sdk=sdk, progress_reporter=progress_reporter, job_ctx=job_ctx) + + config = get_config(job_ctx.config_path) + + logger.info(f"Starting file I/O task with job context: {job_ctx}") + logger.info(f"Config: {config.model_dump_json(indent=2)}") + logger.info(f"NeMo Platform service URL: {sdk.base_url}") + + # Execute uploads if configured + runner.run_upload(config.upload) + + # Execute downloads if configured + runner.run_download(config.download) + + # Report overall completion + progress_reporter.update_progress( + status=PlatformJobStatus.COMPLETED, + status_details={"phase": TaskPhase.COMPLETED, "message": "File I/O task completed successfully"}, + ) + + return 0 + except PathTraversalError as e: + logger.error(f"Security error - path traversal detected: {e}") + if progress_reporter: + progress_reporter.update_progress( + status=PlatformJobStatus.ERROR, + error_details={"message": str(e), "type": type(e).__name__}, + ) + return 1 + except (FileDownloadError, FileUploadError) as e: + logger.exception(f"File operation failed: {e}") + if progress_reporter: + progress_reporter.update_progress( + status=PlatformJobStatus.ERROR, + error_details={"message": str(e), "type": type(e).__name__}, + ) + return 1 + + except Exception as e: + logger.exception(f"File I/O task failed: {e}") + if progress_reporter: + progress_reporter.update_progress( + status=PlatformJobStatus.ERROR, + error_details={"message": str(e), "type": type(e).__name__}, + ) + return 1 + + finally: + if sdk_owned and sdk is not None: + sdk.close() diff --git 
a/services/automodel/src/nmp/automodel/tasks/file_io/utils.py b/services/automodel/src/nmp/automodel/tasks/file_io/utils.py new file mode 100644 index 00000000..e8091052 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/file_io/utils.py @@ -0,0 +1,184 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import json +import logging +from collections.abc import Iterator +from contextlib import contextmanager +from pathlib import Path + +import httpx + +# https://docs.nvidia.com/nemo/microservices/latest/pysdk/index.html#handling-errors +from nemo_platform import ( + APIConnectionError, + APIStatusError, + APITimeoutError, + AuthenticationError, + PermissionDeniedError, +) +from nmp.automodel.app.jobs.file_io.schemas import ( + FileDownloadError, + FileIOTaskConfig, + FileUploadError, + PathTraversalError, + ProgressReportError, +) + +logger = logging.getLogger(__name__) + + +@contextmanager +def filesystem_sdk_error_handler( + error_class: type[FileDownloadError | FileUploadError | ProgressReportError], + operation: str, + passthrough: tuple[type[BaseException], ...] = (), +) -> Iterator[None]: + """Context manager for consistent FilesetFileSystem error handling. + + Catches FilesetFileSystem-specific exceptions and re-raises them as the specified error class + with a consistent message format. + + Args: + error_class: The exception class to raise (FileDownloadError or FileUploadError). + operation: Description of the operation for error messages (e.g., "download file.txt from fileset x/y"). + passthrough: Tuple of exception types to pass through without handling. Allows handling of exceptions outside of the context manager. + + Raises: + error_class: With a descriptive message including the error details. 
+ + """ + try: + yield + except passthrough: + raise + except FileNotFoundError as e: + raise error_class(f"Failed to {operation} due to file not found error. Error: {e}") from e + except PermissionError as e: + raise error_class(f"Failed to {operation} due to permission denied error. Error: {e}") from e + except httpx.TimeoutException as e: + raise error_class(f"Failed to {operation} due to request timeout. Error: {e}") from e + except httpx.ConnectError as e: + raise error_class(f"Failed to {operation} due to connection error. Error: {e}") from e + except Exception as e: + raise error_class(f"Failed to {operation} due to unexpected error {type(e).__name__}: {e}") from e + + +@contextmanager +def sdk_error_handler( + error_class: type[FileDownloadError | FileUploadError | ProgressReportError], + operation: str, + passthrough: tuple[type[BaseException], ...] = (), +) -> Iterator[None]: + """Context manager for consistent SDK error handling. + + Catches SDK-specific exceptions and re-raises them as the specified error class + with a consistent message format. + + Args: + error_class: The exception class to raise (FileDownloadError or FileUploadError). + operation: Description of the operation for error messages (e.g., "download file.txt from fileset x/y"). + passthrough: Tuple of exception types to pass through without handling. Allows handling of exceptions outside of the context manager. + + Raises: + error_class: With a descriptive message including the error details. + + """ + try: + yield + except passthrough: + raise + except APITimeoutError as e: + raise error_class( + f"Failed to {operation} due to request timeout error. Cause: {e.__cause__}. Error: {e}", + ) from e + except APIConnectionError as e: + raise error_class(f"Failed to {operation} due to connection error. Cause: {e.__cause__}. 
Error: {e}") from e + # Note: AuthenticationError and PermissionDeniedError are subclasses of APIStatusError, + # so they must be caught before APIStatusError + except AuthenticationError as e: + raise error_class(f"Failed to {operation} due to authentication error. Error: {e}") from e + except PermissionDeniedError as e: + raise error_class(f"Failed to {operation} due to permission denied error. Error: {e}") from e + except APIStatusError as e: + raise error_class(f"Failed to {operation} due to API error. Status code: {e.status_code}. Error: {e}") from e + except Exception as e: + raise error_class(f"Failed to {operation} due to unexpected error {type(e).__name__}: {e}") from e + + +def get_config(config_path: Path) -> FileIOTaskConfig: + """Get typed task configuration from a config file. + + Loads the JSON config file and validates it against the FileIOTaskConfig schema. + + Args: + config_path: Path to the JSON configuration file. + + Returns: + Validated FileIOTaskConfig. + """ + with open(config_path) as f: + data = json.load(f) + return FileIOTaskConfig.model_validate(data) + + +def validate_storage_path(storage_path: Path) -> Path: + """Validate that a storage path exists and is a directory. + + Args: + storage_path: The storage path to validate. + + Returns: + The validated storage path. + + Raises: + FileUploadError: If the storage path does not exist or is not a directory. + """ + if not storage_path.exists() or not storage_path.is_dir(): + raise FileUploadError( + f"Storage path does not exist: {storage_path}. Ensure the storage path exists and is a directory.", + ) + return storage_path + + +def validate_safe_path(base_path: Path, user_path: str) -> Path: + """Validate that a user-provided path stays within the base directory. + + Prevents path traversal attacks where user input like "../../etc/passwd" could + escape the intended directory. 
The function resolves both paths to their + canonical absolute forms and verifies the result is under the base path. + + Args: + base_path: The base directory that the resolved path must stay within. + user_path: The user-provided relative path (e.g., from config). + + Returns: + The resolved absolute path that is guaranteed to be within base_path. + + Raises: + PathTraversalError: If the resolved path would escape base_path. + + Examples: + >>> base = Path("/var/storage") + >>> validate_safe_path(base, "subdir/file.txt") + PosixPath('/var/storage/subdir/file.txt') + + >>> validate_safe_path(base, "../../etc/passwd") + Raises PathTraversalError + + """ + # Resolve base_path to absolute canonical form + resolved_base = base_path.resolve() + + # Join and resolve the user path + # Using resolve() handles .., ., symlinks, etc. + resolved_path = (base_path / user_path).resolve() + + if not resolved_path.is_relative_to(resolved_base): + raise PathTraversalError( + f"Path '{user_path}' resolves outside of the base directory. " + "This may indicate a path traversal attack. " + "Ensure that paths such as ../.. are not used in the download destination path.", + ) + + return resolved_path diff --git a/services/automodel/src/nmp/automodel/tasks/model_entity/__init__.py b/services/automodel/src/nmp/automodel/tasks/model_entity/__init__.py new file mode 100644 index 00000000..49784f0f --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/model_entity/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Model entity task for creating model entities after customization.""" + +from nmp.automodel.tasks.model_entity.run import run + +__all__ = ["run"] diff --git a/services/automodel/src/nmp/automodel/tasks/model_entity/__main__.py b/services/automodel/src/nmp/automodel/tasks/model_entity/__main__.py new file mode 100644 index 00000000..90a4ffe6 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/model_entity/__main__.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Entry point for model_entity task. + +Usage: + python -m nmp.automodel.tasks.model_entity +""" + +import sys + +from .run import run + +if __name__ == "__main__": + sys.exit(run()) diff --git a/services/automodel/src/nmp/automodel/tasks/model_entity/run.py b/services/automodel/src/nmp/automodel/tasks/model_entity/run.py new file mode 100644 index 00000000..d1a6263d --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/model_entity/run.py @@ -0,0 +1,436 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Model entity task entry point. + +Handles creating model entities in the Models service after customization completes. + +The task reads configuration and creates a Model Entity that references the +uploaded model artifacts in the Files service. 
+ +Usage: + export NEMO_JOB_STEP_CONFIG_FILE_PATH= + python -m nmp.automodel.tasks.model_entity +""" + +import json +import logging +import re +import time +from pathlib import Path + +from nemo_platform import ( + APIConnectionError, + APITimeoutError, + ConflictError, + InternalServerError, + NeMoPlatform, + NotFoundError, +) +from nemo_platform.types.inference import ( + ModelDeploymentConfig, + ModelDeploymentConfigFilterParam, + ModelDeploymentFilterParam, + NIMDeploymentParam, +) +from nemo_platform.types.models import LoraParam, ModelEntity +from nemo_platform.types.shared_params.tool_call_config import ToolCallConfig as ToolCallConfigParam +from nmp.automodel.app.constants import SERVICE_NAME +from nmp.automodel.app.jobs.context import NMPJobContext +from nmp.automodel.app.jobs.model_entity.schemas import ( + DeploymentParameters, + ModelEntityCreationError, + ModelEntityTaskConfig, +) +from nmp.automodel.entities.values import FinetuningType +from nmp.common.sdk_factory import get_task_sdk +from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential + +logger = logging.getLogger(__name__) + +# Retry configuration +MAX_RETRIES = 3 +INITIAL_BACKOFF_SECONDS = 1.0 +MAX_BACKOFF_SECONDS = 30.0 + +ACTIVE_DEPLOYMENT_STATUSES = frozenset({"CREATED", "PENDING", "READY"}) + +SPEC_POLL_INTERVAL_SECONDS = 10 +SPEC_POLL_TIMEOUT_SECONDS = 600 + + +def get_config(config_path: Path) -> ModelEntityTaskConfig: + """Get typed task configuration from a config file. + + Loads the JSON config file and validates it against the ModelEntityTaskConfig schema. + + Args: + config_path: Path to the JSON configuration file. + + Returns: + Validated ModelEntityTaskConfig. 
+ """ + with open(config_path) as f: + data = json.load(f) + return ModelEntityTaskConfig.model_validate(data) + + +def sanitize_name(prefix: str, name: str) -> str: + """Sanitize model_name: keep only allowed chars, replace invalid with hyphen, avoid consecutive/trailing hyphens + + Must be compatible with - {'pattern': '^[a-z](?!.*--)[a-z0-9\\-@.+_]{1,62}(? ModelEntity: + """Poll until the model_spec task has populated the model's spec. + + The spec must be populated before creating a deployment because the + inference service relies on ``spec.family`` and ``spec.base_num_parameters`` + to select the correct NIM profile. + + Raises: + ModelEntityCreationError: If the spec is not populated within the timeout. + """ + logger.info(f"Waiting for model_spec to populate spec on {workspace}/{name}") + start = time.monotonic() + + while time.monotonic() - start < SPEC_POLL_TIMEOUT_SECONDS: + try: + target = self.sdk.models.retrieve(name=name, workspace=workspace) + if target.spec: + logger.info(f"Spec populated on {workspace}/{name}") + return target + except (APIConnectionError, APITimeoutError, InternalServerError) as e: + logger.warning(f"Transient error polling spec for {workspace}/{name}: {e}") + time.sleep(SPEC_POLL_INTERVAL_SECONDS) + + raise ModelEntityCreationError( + f"Timed out waiting for model spec on {workspace}/{name} " + f"after {SPEC_POLL_TIMEOUT_SECONDS}s. The platform could not auto-detect the " + f"model's specifications. Verify the model checkpoint is valid and in a supported format." 
+ ) + + def get_model_entity(self, model_entity: str, fileset_workspace: str) -> ModelEntity: + parts = model_entity.split("/") + if len(parts) == 1: + me_workspace = fileset_workspace + me_name = parts[0] + else: + me_workspace = parts[0] + me_name = parts[1] + + try: + me: ModelEntity = self.sdk.models.retrieve(name=me_name, workspace=me_workspace) + except NotFoundError as e: + raise ModelEntityCreationError(f"Model entity {me_workspace}/{me_name} not found") from e + + return me + + @retry( + stop=stop_after_attempt(MAX_RETRIES), + wait=wait_exponential(multiplier=2, min=INITIAL_BACKOFF_SECONDS, max=MAX_BACKOFF_SECONDS), + retry=retry_if_exception_type((InternalServerError, APITimeoutError, APIConnectionError)), + reraise=True, + ) + def create_model_entity(self, config: ModelEntityTaskConfig) -> tuple[dict, ModelEntity]: + """Create a model entity in the Models service. + + Args: + config: Configuration for the model entity to create. + + Returns: + Tuple of (result dict, deploy target). For LoRA the deploy target is the + base model entity; for SFT it is the newly created output model entity. + + Raises: + ModelEntityCreationError: If creation fails. 
+ """ + workspace = self.job_ctx.workspace + logger.info(f"Creating model entity: {workspace}/{config.name}") + + fileset_workspace = config.fileset.workspace or workspace + fileset_ref = f"{fileset_workspace}/{config.fileset.name}" + + logger.info(f"Validating fileset exists: {fileset_workspace}/{config.fileset.name}") + try: + self.sdk.files.filesets.retrieve(workspace=fileset_workspace, name=config.fileset.name) + logger.info(f"Fileset validation successful: {fileset_workspace}/{config.fileset.name}") + except Exception as e: + logger.error(f"Fileset validation failed: {fileset_workspace}/{config.fileset.name}") + raise ModelEntityCreationError( + f"Cannot create model entity: fileset '{fileset_workspace}/{config.fileset.name}' does not exist or is not accessible" + ) from e + + base_me: ModelEntity = self.get_model_entity(config.model_entity, fileset_workspace) + + if config.peft is not None and config.peft.type == FinetuningType.LORA: + try: + output_me = self.sdk.models.adapters.create( + model_name=base_me.name, + workspace=base_me.workspace, + name=config.name, + description=config.description, + fileset=fileset_ref, + finetuning_type=config.peft.type.value, + lora_config=LoraParam( + alpha=config.peft.alpha, + rank=config.peft.rank, + ), + enabled=True, + ) + return output_me.model_dump(), base_me + except ConflictError: + logger.warning( + f"adapter {base_me.workspace}/{config.name} already exists for model {base_me.workspace}/{base_me.name}, updating with new fileset" + ) + try: + output_me = self.sdk.models.adapters.update( + adapter=config.name, + model_name=base_me.name, + workspace=base_me.workspace, + fileset=fileset_ref, + description=config.description, + enabled=True, + ) + logger.info( + f"Successfully updated adapter: {base_me.workspace}/{config.name} for base model {base_me.workspace}/{base_me.name}" + ) + return output_me.model_dump(), base_me + except (InternalServerError, APITimeoutError, APIConnectionError): + raise + except Exception as 
update_error: + logger.exception( + f"Failed to update existing adapter, {base_me.workspace}/{config.name}: {update_error}" + ) + raise ModelEntityCreationError( + f"Adapter '{config.name}' already exists but update failed: {update_error}" + ) from update_error + except Exception as e: + logger.exception(f"Failed to create model adapter: {e}") + raise ModelEntityCreationError(f"Failed to create model adapter: {e}") from e + else: + ft_type = config.peft.type.value if config.peft else FinetuningType.ALL_WEIGHTS.value + + request_body = { + "name": config.name, + "description": config.description, + "fileset": fileset_ref, + "finetuning_type": ft_type, + "trust_remote_code": base_me.trust_remote_code, + } + + if config.base_model: + request_body["base_model"] = config.base_model + + try: + output_me = self.sdk.models.create( + workspace=workspace, + **request_body, + ) + logger.info(f"Successfully created model entity: {output_me.workspace}/{output_me.name}") + return output_me.model_dump(), output_me + + except ConflictError: + logger.warning(f"Model entity already exists: {workspace}/{config.name}, updating existing model") + try: + update_body = {k: v for k, v in request_body.items() if k != "name"} + output_me = self.sdk.models.update( + name=config.name, + workspace=workspace, + **update_body, + ) + logger.info(f"Successfully updated model entity: {output_me.workspace}/{output_me.name}") + return output_me.model_dump(), output_me + except (InternalServerError, APITimeoutError, APIConnectionError): + raise + except Exception as update_error: + logger.exception(f"Failed to update existing model entity: {update_error}") + raise ModelEntityCreationError( + f"Model entity '{config.name}' already exists and update failed: {update_error}" + ) from update_error + + except Exception as e: + logger.exception(f"Failed to create model entity: {e}") + raise ModelEntityCreationError(f"Failed to create model entity: {e}") from e + + def launch_model(self, config: 
ModelEntityTaskConfig, me: ModelEntity): + """Deploy a model entity after creation. + + For LoRA jobs, ``me`` should be the base model entity. + For SFT jobs, ``me`` should be the output model entity. + """ + dc = config.deployment_config + if dc is None: + return + + # LORA_MERGED produces a full-weight model, so it is deployed like SFT + # and intentionally excluded from LoRA-specific checks below. + is_lora = config.peft is not None and config.peft.type == FinetuningType.LORA + if is_lora and self._has_active_deployment(me): + return + + if is_lora and isinstance(dc, DeploymentParameters) and not dc.lora_enabled: + logger.warning(f"Deployment requested but lora_enabled is false for a LoRA job: {dc}") + return + + # Resolve an existing config or create a new one from inline params. + if isinstance(dc, str): + logger.info(f"Resolving deployment config reference: {dc}") + deployment_config = self._resolve_config_ref(dc, me.workspace) + logger.info(f"Using deployment config: {deployment_config.workspace}/{deployment_config.name}") + else: + deployment_config = self._create_deployment_config(dc, me) + + self._create_deployment(deployment_config, me) + + def _has_active_deployment(self, me: ModelEntity) -> bool: + """Check if the model entity already has an active deployment.""" + deployment_configs = self.sdk.inference.deployment_configs.list( + workspace=me.workspace, + filter=ModelDeploymentConfigFilterParam(model_entity_id=f"{me.workspace}/{me.name}"), + ).data + + for c in deployment_configs: + deployments = self.sdk.inference.deployments.list( + filter=ModelDeploymentFilterParam(config=c.name, workspace=me.workspace) + ).data + for d in deployments: + if d.status in ACTIVE_DEPLOYMENT_STATUSES: + logger.info(f"Active deployment (status={d.status}) exists for config {c.name}, skipping") + return True + + return False + + def _resolve_config_ref(self, config_ref: str, me_workspace: str) -> ModelDeploymentConfig: + """Resolve a ``name`` or ``workspace/name`` 
reference to a ModelDeploymentConfig.""" + parts = config_ref.split("/") + if len(parts) == 2: + workspace = parts[0] + name = parts[1] + elif len(parts) == 1: + workspace = me_workspace + name = parts[0] + else: + raise ModelEntityCreationError( + f"Invalid deployment config reference '{config_ref}': expected 'name' or 'workspace/name'" + ) + + try: + return self.sdk.inference.deployment_configs.retrieve(workspace=workspace, name=name) + except Exception as e: + raise ModelEntityCreationError( + f"Failed to resolve deployment config '{config_ref}' in workspace '{workspace}': {e}" + ) from e + + def _create_deployment_config(self, deploy_params: DeploymentParameters, me: ModelEntity) -> ModelDeploymentConfig: + """Create (or update) a ModelDeploymentConfig from inline parameters.""" + nim_deployment = NIMDeploymentParam( + image_name=deploy_params.image_name, + image_tag=deploy_params.image_tag, + gpu=deploy_params.gpu, + model_name=me.name, + model_namespace=me.workspace, + additional_envs=deploy_params.additional_envs, + lora_enabled=deploy_params.lora_enabled, + ) + + if deploy_params.tool_call_config: + nim_deployment["tool_call_config"] = ToolCallConfigParam( + **deploy_params.tool_call_config.model_dump(exclude_none=True) + ) + + deployment_cfg_name = sanitize_name("sft-cfg", me.name) + try: + return self.sdk.inference.deployment_configs.create( + workspace=me.workspace, + name=deployment_cfg_name, + nim_deployment=nim_deployment, + ) + except ConflictError: + logger.info(f"Deployment config {me.workspace}/{deployment_cfg_name} already exists, updating") + return self.sdk.inference.deployment_configs.update( + workspace=me.workspace, + name=deployment_cfg_name, + nim_deployment=nim_deployment, + ) + + def _create_deployment(self, deployment_config: ModelDeploymentConfig, me: ModelEntity) -> None: + """Create a deployment from the given ModelDeploymentConfig.""" + logger.info(f"Deployment config: {deployment_config}") + + if not me.spec: + _ = 
self._wait_for_spec(me.workspace, me.name) + + deployment_name = sanitize_name("sft-deploy", me.name) + try: + deployment = self.sdk.inference.deployments.create( + workspace=deployment_config.workspace, + name=deployment_name, + config=deployment_config.name, + ) + logger.info(f"Deployment created: {deployment}") + except ConflictError: + logger.info(f"Deployment {deployment_config.workspace}/{deployment_name} already exists") + deployment = self.sdk.inference.deployments.retrieve( + workspace=deployment_config.workspace, + name=deployment_name, + ) + + deployment_status = self.sdk.inference.deployments.retrieve( + workspace=deployment.workspace, + name=deployment.name, + ) + logger.info(f"Deployment status: {deployment_status}") + + +def run(sdk: NeMoPlatform | None = None, job_ctx: NMPJobContext | None = None) -> int: + """Execute the model entity creation task. + + Args: + sdk: Optional SDK instance for dependency injection (for testing). + If None, creates one via get_task_sdk(). + job_ctx: Optional job context for dependency injection (for testing). + If None, creates one via NMPJobContext.from_env(). + + Returns: + Exit code (0 for success, non-zero for failure). 
+ """ + job_ctx = job_ctx or NMPJobContext.from_env() + + sdk_owned = sdk is None + try: + sdk = sdk or get_task_sdk(SERVICE_NAME).with_options(workspace=job_ctx.workspace) + runner = ModelEntityRunner(sdk=sdk, job_ctx=job_ctx) + + config = get_config(job_ctx.config_path) + + logger.info(f"Starting model entity task with job context: {job_ctx}") + logger.info(f"Config: {config.model_dump_json(indent=2)}") + logger.info(f"NeMo Platform service URL: {sdk.base_url}") + + result, deploy_target = runner.create_model_entity(config) + logger.info(f"Model entity creation complete: {result}") + + runner.launch_model(config, deploy_target) + return 0 + + except ModelEntityCreationError as e: + logger.exception(f"Model entity creation failed: {e}") + return 1 + except Exception as e: + logger.exception(f"Model entity task failed: {e}") + return 1 + finally: + if sdk_owned and sdk is not None: + sdk.close() diff --git a/services/automodel/src/nmp/automodel/tasks/progress_reporter.py b/services/automodel/src/nmp/automodel/tasks/progress_reporter.py new file mode 100644 index 00000000..82bb2361 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/progress_reporter.py @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Re-export file_io progress types for backward-compatible imports.""" + +from nmp.automodel.tasks.file_io.progress_reporter import ( + JobsServiceProgressReporter, + NoOpProgressReporter, + ProgressReporter, +) + +__all__ = ["JobsServiceProgressReporter", "NoOpProgressReporter", "ProgressReporter"] diff --git a/services/automodel/src/nmp/automodel/tasks/training/__init__.py b/services/automodel/src/nmp/automodel/tasks/training/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/services/automodel/src/nmp/automodel/tasks/training/__main__.py b/services/automodel/src/nmp/automodel/tasks/training/__main__.py new file mode 100644 index 00000000..f4397f99 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/training/__main__.py @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Training task entry point. + +Usage: + python -m nmp.automodel.tasks.training + +In distributed (multi-node) training, all pods run this entry point. 
+The DistributedContext handles role detection and coordination: +- Rank 0 (coordinator): Runs all phases, reports progress +- Rank > 0 (workers): Participate in training, wait at barriers +""" + +import logging +import sys + +from .runner import TrainingRunner + +logger = logging.getLogger(__name__) + + +def run() -> int: + """Execute training task.""" + try: + with TrainingRunner() as runner: + result = runner.run() + return 0 if result.success else 1 + except Exception as e: + logger.exception(f"Training task failed: {e}") + return 1 + + +if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + ) + sys.exit(run()) diff --git a/services/automodel/src/nmp/automodel/tasks/training/backends/__init__.py b/services/automodel/src/nmp/automodel/tasks/training/backends/__init__.py new file mode 100644 index 00000000..13ea2859 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/training/backends/__init__.py @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import warnings + +from pydantic.warnings import UnsupportedFieldAttributeWarning + +warnings.filterwarnings("ignore", category=UnsupportedFieldAttributeWarning) + +warnings.filterwarnings( + "ignore", + category=UserWarning, + module="torch.distributed.device_mesh", +) + +warnings.filterwarnings( + "ignore", + category=UserWarning, + module="nemo_automodel.components.moe.state_dict_utils", +) diff --git a/services/automodel/src/nmp/automodel/tasks/training/backends/backend.py b/services/automodel/src/nmp/automodel/tasks/training/backends/backend.py new file mode 100644 index 00000000..d5fba7bb --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/training/backends/backend.py @@ -0,0 +1,187 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
class AutomodelBackend:
    """Compiles and runs nemo-automodel training for customization jobs.

    Thin adapter over the module-level helpers: it compiles the library
    config, launches training as a ``torchrun`` subprocess, and locates and
    post-processes the resulting checkpoint.
    """

    def __init__(self, job_ctx: NMPJobContext):
        # The job context is only forwarded to compile_automodel_config().
        self.job_ctx = job_ctx

    def compile_config(
        self,
        config: TrainingStepConfig,
        workspace_dir: Path,
    ) -> dict[str, Any]:
        """
        Compile Automodel-specific configuration.

        Returns the config as a dict; the runner handles writing it to disk.

        NOTE(review): this was described as a pure transformation with no file
        I/O, but compile_automodel_config() calls prepare_dataset() with
        ``output_dir=workspace_dir / "dataset"`` and validates the prepared
        files, so it appears to touch the workspace - confirm and keep this
        docstring in sync.

        Args:
            config: Standardized training step configuration.
            workspace_dir: Job workspace directory passed through to the compiler.

        Returns:
            The compiled Automodel configuration dictionary.
        """
        return compile_automodel_config(config, workspace_dir, self.job_ctx)

    def execute_training(
        self,
        customizer_config: TrainingStepConfig,
        library_config: LibraryConfig,
        progress: JobsServiceProgressReporter,
    ) -> TrainingMetrics:
        """Execute training using CustomizerTrainFinetuneRecipe or CustomizerBiencoderRecipe.

        The config file has already been written to disk by the runner.
        Progress reporting happens within the training subprocess via
        TrainingProgressCallback, which reads job context from environment
        variables.

        Side effects: installs SIGINT/SIGTERM handlers for the current process
        and does not restore the previous handlers afterwards.

        Args:
            customizer_config: Only ``training_timeout`` is read here; it bounds
                the wait on the training subprocess.
            library_config: ``config_path`` is passed to the subprocess as ``--config``.
            progress: Used once, to report that the training phase is running.

        Returns:
            A placeholder ``TrainingMetrics(total_steps=0, total_epochs=0)``.

        Raises:
            subprocess.TimeoutExpired: Re-raised after the subprocess has been
                killed when it exceeds ``training_timeout``.
            Exception: Whatever ``parse_error_from_output(...).to_exception()``
                builds when the subprocess exits with a non-zero code.
            SystemExit: Raised by the signal handler on SIGINT/SIGTERM.
        """
        progress.report_running("training", backend="automodel")

        # Run training with our custom recipe.
        # Note: `progress` is not handed to the subprocess; step-level progress
        # reporting happens inside it via TrainingProgressCallback using
        # environment variables.
        command = ["torchrun"]
        command.extend(generate_torchrun_flags_from_env())
        command.extend(
            [
                "-m",
                "nmp.automodel.tasks.training.backends.finetune",
                "--config",
                str(library_config.config_path),
            ]
        )

        logger.info(f"Executing: {' '.join(command)}")

        # Declared before the handler so the closure below can see it; it stays
        # None until Popen() returns.
        training_process: subprocess.Popen | None = None

        # Rolling buffer to keep recent output lines for error extraction
        output_lines: deque[str] = deque(maxlen=MAX_OUTPUT_LINES)
        reader_thread: threading.Thread | None = None

        def cleanup(signum, frame):
            # Forward the signal to the child, give it 30s to exit, then
            # hard-kill it; this process always exits with the signal number.
            logger.warning(f"Signal {signum} received, terminating...")
            if training_process:
                training_process.send_signal(signum)
                try:
                    training_process.wait(timeout=30)
                except subprocess.TimeoutExpired:
                    training_process.kill()
            raise SystemExit(signum)

        # Handlers are registered before the child is spawned.
        signal.signal(signal.SIGINT, cleanup)
        signal.signal(signal.SIGTERM, cleanup)

        start_time = time.time()

        training_process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # interleave stderr so one reader sees everything
            text=True,
            bufsize=1,  # Line buffered
        )

        # Start reader thread to capture output without blocking
        reader_thread = threading.Thread(
            target=read_subprocess_output,
            args=(training_process, output_lines),
            daemon=True,
        )
        reader_thread.start()

        try:
            training_process.wait(timeout=customizer_config.training_timeout)
        except subprocess.TimeoutExpired:
            logger.exception("Training timed out")
            training_process.kill()
            # Reap the killed process to avoid zombies
            try:
                training_process.wait(timeout=30)
            except subprocess.TimeoutExpired:
                logger.warning(
                    "Killed training process did not terminate within 30s - "
                    "process may be stuck in uninterruptible state"
                )
            # Wait for reader thread to capture any remaining output before re-raising
            if reader_thread and reader_thread.is_alive():
                reader_thread.join(timeout=5)
            raise  # Let runner.py convert via create_error_details()

        # Wait for reader thread to finish capturing output (bounded, so the
        # buffer may still be incomplete if the join times out)
        if reader_thread and reader_thread.is_alive():
            reader_thread.join(timeout=5)

        duration = time.time() - start_time
        logger.info(f"Training finished in {duration:.1f} seconds")

        if training_process.returncode != 0:
            # Build the failure from the buffered tail of the subprocess output.
            parsed = parse_error_from_output(output_lines, training_process.returncode)
            raise parsed.to_exception()

        # Return empty metrics (actual metrics are reported via callbacks during training)
        # TODO: Consider parsing training logs or checkpoints to extract final metrics.
        return TrainingMetrics(total_steps=0, total_epochs=0)

    def find_best_checkpoint(
        self,
        workspace_dir: Path,
        customizer_config: TrainingStepConfig,
        library_config: Optional[LibraryConfig] = None,
    ) -> Path:
        """Find best Automodel checkpoint.

        Picks the model type from ``customizer_config.model.is_embedding_model``
        and delegates to the module-level ``find_best_checkpoint`` imported from
        ``.checkpoints``. ``library_config`` is accepted but not used here.
        """
        model_type = ModelType.EMBEDDING if customizer_config.model.is_embedding_model else ModelType.LLM
        # The bare name resolves to the imported function, not to this method.
        return find_best_checkpoint(workspace_dir, customizer_config, model_type=model_type)

    def process_checkpoint(
        self,
        checkpoint_path: Path,
        output_path: Path,
        customizer_config: TrainingStepConfig,
        library_config: LibraryConfig | None = None,
    ) -> CheckpointInfo:
        """Process Automodel checkpoint.

        Delegates to the module-level ``process_checkpoint`` imported from
        ``.checkpoints``. For LLM jobs, the ``_resolved_chat_template`` entry of
        ``library_config.config_dict`` (when present) is forwarded with the call.
        """
        model_type = ModelType.EMBEDDING if customizer_config.model.is_embedding_model else ModelType.LLM

        # Extract resolved chat template from library config if available (LLM only)
        resolved_template = None
        if model_type == ModelType.LLM and library_config and library_config.config_dict:
            resolved_template = library_config.config_dict.get("_resolved_chat_template")

        return process_checkpoint(
            checkpoint_path,
            output_path,
            customizer_config,
            model_type=model_type,
            resolved_chat_template=resolved_template,
        )
+ """ + + def __init__(self, reporter: JobsServiceProgressReporter): + self._reporter = reporter + + prior = reporter.fetch_current_metrics() + self._train_metrics: list[dict[str, float | int]] = prior.get("train_loss", []) + self._val_metrics: list[dict[str, float | int]] = prior.get("val_loss", []) + if self._train_metrics or self._val_metrics: + logger.info( + "Seeded metrics from server: %d train_loss, %d val_loss entries", + len(self._train_metrics), + len(self._val_metrics), + ) + + def _build_metrics_summary(self) -> dict[str, list[dict[str, float | int]]]: + """Build the accumulated metrics payload for inclusion in status_details.""" + return { + "train_loss": list(self._train_metrics), + "val_loss": list(self._val_metrics), + } + + def report_training_start(self, max_steps: int, num_epochs: int) -> None: + """Report that training has started with schedule information.""" + self._reporter.configure_progress_tracking(max_steps, num_epochs) + self._reporter.report_running(phase="training", step=0, max_steps=max_steps, num_epochs=num_epochs) + + def report_train_step( + self, + step: int, + epoch: int, + loss: float, + lr: float | None = None, + grad_norm: float | None = None, + ) -> None: + """Report training step with metrics.""" + self._train_metrics.append({"step": step, "epoch": epoch, "value": loss}) + self._reporter.report_running( + phase="training", + step=step, + epoch=epoch, + train_loss=loss, + lr=lr, + grad_norm=grad_norm, + metrics=self._build_metrics_summary(), + ) + + def report_validation(self, step: int, epoch: int, val_loss: float) -> None: + """Report validation results.""" + self._val_metrics.append({"step": step, "epoch": epoch, "value": val_loss}) + self._reporter.report_running( + phase="validation", + step=step, + epoch=epoch, + val_loss=val_loss, + metrics=self._build_metrics_summary(), + ) + + def report_checkpoint_saved(self, step: int, epoch: int, checkpoint_path: str | None = None) -> None: + """Report that a checkpoint was 
def extract_precision_from_model_config(model_path: str | Path) -> Precision | None:
    """
    Extract precision from a HuggingFace model's config.json.

    HuggingFace models store their dtype in config.json (e.g., "bfloat16").
    Older Transformers releases write it under ``torch_dtype``; newer releases
    write it under ``dtype``. Both keys are checked, ``torch_dtype`` first.

    This is used to determine the actual training precision when "auto" was used
    for torch_dtype. The precision comes from the base model's config, not from
    the output checkpoint (which may only contain adapter weights for LoRA).

    Args:
        model_path: Path to the model directory containing config.json

    Returns:
        Precision enum value if found, None otherwise. Every failure mode
        (missing file, unreadable or malformed JSON, missing or unknown dtype)
        is logged as a warning and reported as None rather than raised.
    """
    config_path = Path(model_path) / "config.json"
    if not config_path.exists():
        logger.warning(f"config.json not found at {config_path}, cannot extract precision")
        return None

    try:
        with open(config_path, "r") as f:
            config = json.load(f)
    except (json.JSONDecodeError, OSError) as e:
        logger.warning(f"Failed to read config.json: {e}")
        return None

    # A valid JSON document need not be an object; without this guard the
    # .get() calls below would raise AttributeError.
    if not isinstance(config, dict):
        logger.warning(f"config.json at {config_path} is not a JSON object, cannot extract precision")
        return None

    # Prefer the legacy key so existing configs resolve exactly as before.
    torch_dtype = config.get("torch_dtype")
    if torch_dtype is None:
        torch_dtype = config.get("dtype")
    if torch_dtype is None:
        logger.warning("torch_dtype not found in config.json")
        return None

    try:
        precision = Precision.from_hf_dtype(torch_dtype)
    except ValueError:
        logger.warning(f"Unknown torch_dtype '{torch_dtype}' in config.json, cannot map to Precision")
        return None

    logger.info(f"Extracted precision from model config: {torch_dtype} -> {precision.value}")
    return precision
def find_best_checkpoint(
    workspace_dir: Path,
    config: TrainingStepConfig,
    model_type: ModelType = ModelType.LLM,
) -> Path:
    """
    Find the best checkpoint directory.

    Order of preference:
    1. ``LOWEST_VAL`` symlink
    2. ``LATEST`` symlink
    3. The ``epoch_*_step_*`` directory with the highest step number

    A symlink candidate that cannot be resolved to a usable model directory is
    logged and skipped, so the search continues with the next option.

    Args:
        workspace_dir: Job workspace; checkpoints are looked up under
            ``workspace_dir / "checkpoints"``.
        config: Training configuration; ``training.finetuning_type`` decides
            whether the LoRA or full-weights directory layout is expected.
        model_type: Only affects the wording of log and error messages.

    Returns:
        Path to the model directory inside the selected checkpoint.

    Raises:
        FileNotFoundError: If no ``epoch_*_step_*`` directory exists, or the
            fallback checkpoint has no model directory.
    """
    base_dir = workspace_dir / "checkpoints"
    is_peft = config.training.finetuning_type in (FinetuningType.LORA, FinetuningType.LORA_MERGED)
    # The label carries its own trailing space so it can be dropped into a
    # message without post-processing the whole string (which would also
    # rewrite whitespace inside interpolated paths).
    type_label = "embedding " if model_type == ModelType.EMBEDDING else ""

    for link_name in ("LOWEST_VAL", "LATEST"):
        link = base_dir / link_name
        # exists() follows the link, so dangling symlinks are skipped here.
        if link.exists() and link.is_symlink():
            try:
                target = link.resolve()
                if target.exists():
                    logger.info(f"Using {link_name} {type_label}checkpoint: {target.name}")
                    return get_model_dir_from_checkpoint(target, is_peft)
            except Exception as e:
                # Deliberate best effort: fall through to the next candidate.
                logger.warning(f"Failed to resolve {link_name} symlink: {e}")

    # Fallback: scan directories
    epoch_step_dirs = list(base_dir.glob("epoch_*_step_*"))
    if not epoch_step_dirs:
        raise FileNotFoundError(f"No {type_label}checkpoint directories found in {base_dir}")

    best_checkpoint = max(epoch_step_dirs, key=extract_step_number)
    logger.info(f"Using latest {type_label}checkpoint by step number: {best_checkpoint.name}")
    return get_model_dir_from_checkpoint(best_checkpoint, is_peft)
+ + Reference: https://github.com/huggingface/transformers/commit/dc262ee6f57f2154f5233e53482da14dbe3be834 + """ + config_path = model_path / "config.json" + if not config_path.exists(): + logger.warning(f"config.json not found at {config_path}, skipping FSDP2 fix") + return + + with open(config_path, "r") as f: + config = json.load(f) + + if "architectures" not in config: + return + + original_archs = config["architectures"] + fixed_archs = [arch.removeprefix("FSDP") for arch in original_archs] + + if original_archs != fixed_archs: + config["architectures"] = fixed_archs + with open(config_path, "w") as f: + json.dump(config, f, indent=2) + logger.info(f"Fixed FSDP2 architecture names: {original_archs} -> {fixed_archs}") + + +def merge_lora_adapter( + adapter_path: Path, + base_model_path: str, + output_path: Path, +) -> None: + """ + Merge LoRA adapter weights into the base model. + + Uses HuggingFace's PEFT library to: + 1. Load the base model + 2. Attach the LoRA adapter + 3. Merge weights using merge_and_unload() + 4. Save as a standard HuggingFace checkpoint + + Note: This function only supports LLM models. For embedding models, + use merge_lora_embedding_adapter() instead. + + Args: + adapter_path: Path to the LoRA adapter checkpoint + base_model_path: Path to the base model (for loading weights) + output_path: Where to save the merged model + """ + try: + import torch + from peft import PeftModel + from transformers import AutoModelForCausalLM, AutoTokenizer + except ImportError as e: + raise ImportError( + "LoRA merge requires 'peft' and 'transformers' packages. Ensure they are installed in the container." 
def merge_lora_embedding_adapter(
    adapter_path: Path,
    base_model_path: str,
    output_path: Path,
) -> None:
    """Merge a LoRA adapter into a base embedding model.

    This intentionally mirrors the logic in Automodel's `tools/merge_lora.py`,
    but is implemented locally because the customizer container may not have
    that module on `PYTHONPATH`.

    The merged model is written to a private staging directory first and then
    copied into ``output_path``. The staging directory is always removed and
    the model's memory is released, whether or not the merge succeeds.

    Args:
        adapter_path: Path to the PEFT adapter directory.
        base_model_path: HuggingFace model name or path for the base encoder.
        output_path: Where to write the merged model.

    Raises:
        ImportError: If ``torch``, ``peft`` or ``transformers`` cannot be imported.
    """
    import gc
    import tempfile

    try:
        import torch
        from peft import PeftModel
        from transformers import AutoModel, AutoTokenizer
    except ImportError as e:
        raise ImportError(
            "LoRA merge requires 'peft' and 'transformers' packages. Ensure they are installed in the container."
        ) from e

    logger.info("Merging embedding LoRA adapter from %s with base model %s", adapter_path, base_model_path)

    # Use scratch directory if available for better I/O performance; otherwise
    # fall back to the system temp directory. mkdtemp() gives every call its own
    # directory, so concurrent merges cannot clobber each other's staging files.
    staging_root = "/scratch" if Path("/scratch").is_dir() else None
    tmp_path = Path(tempfile.mkdtemp(prefix="merged_lora_", dir=staging_root))
    model = None
    try:
        logger.info("Loading base model (AutoModel): %s", base_model_path)
        model = AutoModel.from_pretrained(
            base_model_path,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )

        logger.info("Loading adapter from %s", adapter_path)
        model = PeftModel.from_pretrained(model, str(adapter_path))

        logger.info("Merging adapter into base model")
        model = model.merge_and_unload()

        logger.info("Saving merged model to %s", tmp_path)
        model.save_pretrained(tmp_path, safe_serialization=True)

        # The tokenizer is best effort: a failure here is logged, not raised.
        try:
            tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
            tokenizer.save_pretrained(tmp_path)
            logger.info("Tokenizer saved to %s", tmp_path)
        except Exception as e:
            logger.warning("Could not save tokenizer: %s", e)

        output_path.mkdir(parents=True, exist_ok=True)
        shutil.copytree(tmp_path, output_path, dirs_exist_ok=True)
        logger.info("Successfully merged embedding LoRA adapter to %s", output_path)

    finally:
        shutil.rmtree(tmp_path, ignore_errors=True)
        # Drop the last reference and collect before emptying the CUDA cache;
        # memory still owned by a live model cannot be returned to the device.
        model = None
        gc.collect()
        torch.cuda.empty_cache()
+ + Uses Automodel's export_to_onnx to export to ONNX format. + The resulting `model.onnx` is written into *output_path* alongside + the existing HuggingFace checkpoint files. + + Args: + model_path: Path to the HuggingFace model directory (config.json + weights). + output_path: Directory where ``model.onnx`` will be written. + tokenizer_path: Fallback tokenizer location (base model path). Used when + the checkpoint directory does not contain tokenizer files. + + Returns: + Path to the exported ``model.onnx`` file. + """ + # need to import here for the tests + from nemo_automodel.components.models.biencoder.export_onnx import export_to_onnx + + logger.info(f"Exporting embedding model at path {model_path} to ONNX format at path {output_path}") + + try: + onnx_path = export_to_onnx( + model_path=str(model_path), + output_dir=str(output_path), + tokenizer_path=tokenizer_path, + pooling="avg", + normalize=True, + opset=17, + export_dtype="fp16", + verify=True, + ) + except Exception: + logger.exception(f"ONNX export failed for model at {model_path}") + raise + + logger.info(f"ONNX model exported to {onnx_path}") + return Path(onnx_path) + + +_ONNX_TOP_LEVEL_PATTERNS = {"model.onnx", "model.onnx.data", "tokenizer"} + + +def _restructure_embedding_output(output_path: Path) -> None: + """Move HF artifacts into ``alternates/hf/`` so the NIM selects the ONNX profile. + + NIM scans the top-level directory to choose the model backend. If it sees + ``.safetensors`` files it creates a PyTorch profile, which is unsupported + for custom models in many NIM versions. The legacy customizer kept only + ``model.onnx`` (+ tokenizer/) at the root and placed HF weights under + ``alternates/hf/``. This function reproduces that layout. 
def process_checkpoint(
    checkpoint_path: Path,
    output_path: Path,
    customizer_config: TrainingStepConfig,
    model_type: ModelType = ModelType.LLM,
    resolved_chat_template: str | None = None,
) -> CheckpointInfo:
    """
    Process checkpoint to standard output format.

    Works for both LLM and embedding (biencoder) models.

    Handles three scenarios:
    1. Full weights training: Copy checkpoint, fix FSDP2 arch, preserve chat template (LLM only)
    2. LoRA (unmerged): Copy adapter, preserve format as hf-peft
    3. LoRA merged: Merge adapter with base model, output as standard HF

    For embedding models the result is additionally exported to ONNX and the
    output directory is restructured so the ONNX artifacts sit at the top level.

    Args:
        checkpoint_path: Path to the checkpoint directory (model files)
        output_path: Where to write the processed checkpoint
        customizer_config: Training configuration with model paths and settings
        model_type: Type of model ("llm" or "embedding")
        resolved_chat_template: Pre-resolved chat template from training config (LLM only).
            If provided, this template is used. Otherwise, falls back to
            priority-based resolution using model.name and model.path.

    Returns:
        CheckpointInfo with output path, format, and precision
    """
    output_path.mkdir(parents=True, exist_ok=True)

    finetuning_type = customizer_config.training.finetuning_type
    base_model_path = customizer_config.model.path
    is_embedding = model_type == ModelType.EMBEDDING
    # The label carries its own trailing space so it can be dropped into a
    # message without post-processing the whole string (which would also
    # rewrite whitespace inside the interpolated paths).
    type_label = "embedding " if is_embedding else ""

    # Resolve chat template using the same priority logic as training:
    # 1. Use pre-resolved template if provided (ensures consistency with training)
    # 2. Otherwise, resolve using priority-based selection
    chat_template: str | None = None
    if not is_embedding:
        if resolved_chat_template is not None:
            chat_template = resolved_chat_template
            logger.info("Using pre-resolved chat template from training config")
        else:
            # Fall back to priority-based resolution (user_template from fileset metadata takes priority)
            chat_template = resolve_chat_template(
                model_path=base_model_path,
                model_name=customizer_config.model.name,
                user_template=customizer_config.model.chat_template,
            )

    if finetuning_type == FinetuningType.LORA_MERGED:
        # LoRA merged: merge adapter weights into base model
        # For embedding models, this produces a full-weight model compatible with ONNX export and NIM serving.
        if is_embedding:
            merge_lora_embedding_adapter(
                adapter_path=checkpoint_path,
                base_model_path=base_model_path,
                output_path=output_path,
            )
        else:
            merge_lora_adapter(
                adapter_path=checkpoint_path,
                base_model_path=base_model_path,
                output_path=output_path,
            )
        checkpoint_format = CheckpointFormat.HF

        # Fix FSDP2 architecture naming
        fix_fsdp2_architecture(output_path)
        # Apply chat template for LLM models only (chat_template stays None for embeddings)
        if chat_template:
            apply_chat_template_to_checkpoint(output_path, chat_template)

    elif finetuning_type == FinetuningType.LORA:
        # LoRA unmerged: just copy the adapter files
        logger.info(f"Copying {type_label}LoRA adapter from {checkpoint_path} to {output_path}")
        shutil.copytree(checkpoint_path, output_path, dirs_exist_ok=True)
        checkpoint_format = CheckpointFormat.HF_PEFT
        # Note: For hf-peft, chat template is inherited from base model at inference time

    else:
        # Full weights training: copy and process
        logger.info(f"Copying {type_label}full weights checkpoint from {checkpoint_path} to {output_path}")
        shutil.copytree(checkpoint_path, output_path, dirs_exist_ok=True)
        checkpoint_format = CheckpointFormat.HF

        # Fix FSDP2 architecture naming
        fix_fsdp2_architecture(output_path)
        # Apply chat template for LLM models only (chat_template stays None for embeddings)
        if chat_template:
            apply_chat_template_to_checkpoint(output_path, chat_template)

    if is_embedding:
        # NOTE(review): this also runs for unmerged LoRA, where output_path holds
        # only the adapter files - confirm export_to_onnx can load that layout.
        export_onnx(
            model_path=output_path,
            output_path=output_path,
            tokenizer_path=base_model_path,
        )
        _restructure_embedding_output(output_path)

    # Determine precision: use explicit config value, or extract from base model
    precision = customizer_config.model.precision
    if precision is None:
        precision = extract_precision_from_model_config(customizer_config.model.path)

    return CheckpointInfo(
        path=str(output_path),
        format=checkpoint_format,
        precision=precision,
    )
+""" + +import logging +import os +from pathlib import Path +from typing import Any + +from nemo_automodel._transformers.registry import ModelRegistry +from nmp.automodel.app.jobs.context import NMPJobContext +from nmp.automodel.tasks.training.chat_templates import resolve_chat_template +from nmp.automodel.tasks.training.datasets.preparation import ( + DatasetSchema, + PreparedDataset, + compute_val_check_interval, + detect_dataset_schema, + prepare_dataset, +) +from nmp.automodel.tasks.training.datasets.validation import DatasetValidator +from nmp.automodel.tasks.training.integrations import ( + build_mlflow_config, + build_wandb_config, +) +from nmp.automodel.tasks.training.schemas import ( + EmbeddingConfig, + FinetuningType, + LoRAConfig, + TrainingStepConfig, + TrainingType, +) +from nmp.automodel.tasks.training.sequence_packing import ( + calculate_optimal_pack_size, + estimate_dataset_sequence_lengths, +) + +logger = logging.getLogger(__name__) + + +def compile_automodel_config( + customizer_config: TrainingStepConfig, + workspace_dir: Path, + job_ctx: NMPJobContext, +) -> dict[str, Any]: + """ + Compile Automodel-specific configuration. + + This transforms the standardized TrainingStepConfig into the format + expected by nemo_automodel's TrainFinetuneRecipeForNextTokenPrediction. 
+ """ + cfg: dict[str, Any] = {} + _is_embedding_model = customizer_config.model.is_embedding_model + trust_remote_code = customizer_config.model.trust_remote_code + embedding_config = EmbeddingConfig() + + # === Distributed Environment === + # Required for torch.distributed initialization + cfg["dist_env"] = { + "backend": "nccl", + "timeout_minutes": 30, # Higher timeout for large model loading + } + + # === Random Number Generator === + # Both recipes use StatefulRNG for reproducibility across restarts and multi-node training, + # but they expect the config in different formats: + # - Biencoder recipe: expects cfg["seed"] and creates StatefulRNG internally + # - LLM recipe: expects cfg["rng"] with full StatefulRNG config + seed = int(os.environ.get("PL_GLOBAL_SEED", customizer_config.seed)) + + if _is_embedding_model: + # Biencoder recipe creates StatefulRNG from seed value internally + # See: nemo_automodel/recipes/biencoder/train_biencoder.py + cfg["seed"] = seed + else: + # LLM recipe expects the full rng config object + cfg["rng"] = { + "_target_": "nemo_automodel.components.training.rng.StatefulRNG", + "seed": seed, + "ranked": True, # Different seed per rank for data augmentation + } + + # === Model Configuration === + # Common fields shared by both embedding and causal LM models + cfg["model"] = { + "pretrained_model_name_or_path": customizer_config.model.path, + "torch_dtype": customizer_config.model.precision.to_torch_dtype() + if customizer_config.model.precision + else "auto", + # trust_remote_code is required for models like nvidia/llama-nemotron-embed-1b-v2 + # which use custom model_type "llama_bidirec" with custom modeling code. 
+ "trust_remote_code": trust_remote_code, + } + if customizer_config.model.override_custom_impl: + cfg["model"]["force_hf"] = True + + if _is_embedding_model: + cfg["model"].update( + { + "_target_": "nemo_automodel.components.models.biencoder.NeMoAutoModelBiencoder.from_pretrained", + # Use the same encoder for both queries and passages. default value taken from Automodel example + "share_encoder": True, + # Add a trainable linear layer after pooling to reduce embedding dimension. default value taken from Automodel example + "add_linear_pooler": False, + # How to combine token embeddings into a single document/query embedding. default value taken from Automodel example + "pooling": "avg", + # Normalize embeddings to unit length (length = 1). default value taken from Automodel example + "l2_normalize": True, + # When training an embedding model, we want it to learn that similar things should have similar embeddings + # and different things should have different embeddings. + # Temperature controls how "strict" the model is when learning these relationships. + # Low value (0.02), tells the model to pick the correct doc and penalizes near-misses. + # High value (like 1.0 that's Automodel default) tells the model to be more lenient and allows for near-misses. + # 0.02 is taken from the Automodel example for biencoder training. + "t": 0.02, + # Total number of passages per query during training: 1 positive + (n-1) negatives. + # For example, train_n_passages=5 means 1 positive and 4 negative passages per query. + # This differs from legacy Customizer's 'num_hard_negatives' which only counted negatives + # (num_hard_negatives=4 is equivalent to train_n_passages=5). + "train_n_passages": embedding_config.train_n_passages, + # Number of negative passages per query during validation. + "eval_negative_size": get_eval_negative_size(embedding_config), + # Gradient checkpointing saves memory by not storing all activations during forward pass. 
+ # Instead, it recomputes them during backward pass with a memory trade-off - less memory, slower training. + # Useful for large models or limited GPU memory. + # TODO: consider exposing this in CustomizationJobInput + "do_gradient_checkpointing": embedding_config.do_gradient_checkpointing, + "use_liger_kernel": True, + "use_sdpa_patching": True, + } + ) + + # === Tokenizer === + cfg["tokenizer"] = { + "_target_": "nemo_automodel._transformers.auto_tokenizer.NeMoAutoTokenizer.from_pretrained", + "pretrained_model_name_or_path": customizer_config.model.path, + } + else: + cfg["model"].update( + { + "_target_": "nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained", + "attn_implementation": customizer_config.model.attn_implementation, + } + ) + + # === Distributed Configuration === + p = customizer_config.parallelism + total_gpus = p.num_nodes * p.num_gpus_per_node + # Note dp_size is typically auto-derived by Automodel (world_size / (tp * pp * cp)), + # but we calculate it explicitly here because: + # 1. It's validated upstream in validators.py + # 2. We need it for warmup_steps validation below + # 3. 
Passing an explicit value ensures consistency rather than relying on Automodel's derivation + dp = total_gpus // (p.tensor_parallel_size * p.pipeline_parallel_size * p.context_parallel_size) + + cfg["distributed"] = { + "_target_": "nemo_automodel.components.distributed.fsdp2.FSDP2Manager", + "dp_size": dp, + "tp_size": p.tensor_parallel_size, + "pp_size": p.pipeline_parallel_size, + "cp_size": p.context_parallel_size, + "ep_size": p.expert_parallel_size, + "sequence_parallel": p.sequence_parallel, + } + if p.pipeline_parallel_size > 1: + cfg["distributed"]["pipeline"] = { + "pp_schedule": "interleaved1f1b", + "pp_microbatch_size": 1, + "scale_grads_in_schedule": False, + } + + # === Dataset Preparation === + # Discover, merge, and optionally split dataset files + prepared = prepare_dataset( + dataset_path=Path(customizer_config.dataset.path), + output_dir=workspace_dir / "dataset", + seed=customizer_config.seed, + ) + logger.info( + f"Prepared dataset: train={prepared.train_samples} samples, validation={prepared.validation_samples} samples, files: " + f"train={prepared.train_file.absolute()}, validation={prepared.validation_file.absolute()}" + ) + validator = DatasetValidator(training_type=customizer_config.training.training_type) + validator.validate_dataset(str(prepared.train_file)) + validator.validate_dataset(str(prepared.validation_file)) + logger.info("Validated datasets successfully") + + # === Step Scheduler (with val_check_interval conversion) === + batch_size = customizer_config.batch.global_batch_size + epochs = customizer_config.schedule.epochs + + # Compute steps per epoch (round up to ensure all samples are used) + steps_per_epoch = (prepared.train_samples + batch_size - 1) // batch_size + total_steps = steps_per_epoch * epochs + + # Determine effective max_steps + user_max_steps = customizer_config.schedule.max_steps + if user_max_steps and user_max_steps > 0: + max_steps = min(user_max_steps, total_steps) + else: + max_steps = total_steps + + 
logger.info( + f"Training schedule: {prepared.train_samples} samples, batch_size={batch_size}, " + f"steps_per_epoch={steps_per_epoch}, epochs={epochs}, max_steps={max_steps}" + ) + + cfg["step_scheduler"] = { + "global_batch_size": batch_size, + "local_batch_size": customizer_config.batch.micro_batch_size, + "max_steps": max_steps, + "num_epochs": epochs, + } + + val_every_steps = compute_val_check_interval( + steps_per_epoch=steps_per_epoch, + max_steps=max_steps, + val_check_interval=customizer_config.schedule.val_check_interval, + ) + cfg["step_scheduler"]["val_every_steps"] = val_every_steps + cfg["step_scheduler"]["ckpt_every_steps"] = val_every_steps + logger.info(f"Validation interval: {customizer_config.schedule.val_check_interval} -> {val_every_steps} steps") + + # === Validate warmup_steps === + # Automodel requires: lr_warmup_steps < lr_decay_steps (scheduler.py line 96) + # lr_decay_steps = total_optimizer_steps (accounting for gradient accumulation) + warmup_steps = customizer_config.optimizer.warmup_steps + if warmup_steps > 0: + micro_batch_size = customizer_config.batch.micro_batch_size + + # Calculate gradient accumulation steps (how StepScheduler computes it) + grad_acc_steps = batch_size // (micro_batch_size * dp) + + # Calculate total optimizer steps (accounting for gradient accumulation) + total_optimizer_steps = (epochs * prepared.train_samples) // grad_acc_steps + + # lr_decay_steps will be min(max_steps, total_optimizer_steps) + lr_decay_steps = min(total_optimizer_steps, max_steps) + + if warmup_steps >= lr_decay_steps: + raise ValueError( + f"warmup_steps ({warmup_steps}) must be less than lr_decay_steps ({lr_decay_steps}). 
" + f"Calculation: grad_acc_steps={grad_acc_steps} (batch_size={batch_size} / " + f"(micro_batch_size={micro_batch_size} * dp_size={dp})), " + f"total_optimizer_steps={total_optimizer_steps} (epochs={epochs} * " + f"steps_per_epoch={prepared.train_samples} / grad_acc_steps={grad_acc_steps}), " + f"lr_decay_steps=min({total_optimizer_steps}, {max_steps})={lr_decay_steps}" + ) + + # === Optimizer === + cfg["optimizer"] = { + "_target_": "torch.optim.Adam", + "lr": customizer_config.optimizer.learning_rate, + "weight_decay": customizer_config.optimizer.weight_decay, + "betas": [customizer_config.optimizer.beta1, customizer_config.optimizer.beta2], + "eps": customizer_config.optimizer.eps, # Adam epsilon for numerical stability + } + + cfg["lr_scheduler"] = { + "lr_decay_style": "cosine", + "lr_warmup_steps": customizer_config.optimizer.warmup_steps, + } + if customizer_config.optimizer.min_learning_rate: + cfg["lr_scheduler"]["min_lr"] = customizer_config.optimizer.min_learning_rate + + # === Checkpoint === + cfg["checkpoint"] = { + "enabled": True, + "model_save_format": "safetensors", + "checkpoint_dir": str(workspace_dir / "checkpoints"), + "save_consolidated": True, + # Required for models with quantized base weights (e.g., GPT-OSS) + # Safe to enable even for non-quantized models + "dequantize_base_checkpoint": True, + "v4_compatible": customizer_config.model.v4_compatible, + } + + # === Sequence Packing (must be computed before dataset config) === + # When packing is enabled, we use the pack size as the effective sequence length + # for dataset configuration. This ensures samples are truncated appropriately. 
+ effective_seq_length = customizer_config.model.max_seq_length + if not _is_embedding_model: + if customizer_config.batch.sequence_packing: + # Calculate optimal pack size based on dataset statistics + packing_estimate = estimate_dataset_sequence_lengths( + customizer_config, + train_file=prepared.train_file, + max_samples=customizer_config.batch.sequence_packing_max_samples, + seed=customizer_config.seed, + trust_remote_code=trust_remote_code, + ) + + if packing_estimate is not None: + optimal_pack_size = packing_estimate.pack_size + logger.info( + f"Sequence packing enabled: pack_size={optimal_pack_size}, " + f"avg_seq={packing_estimate.avg_seq_length}, max_seq={packing_estimate.max_seq_length}, " + f"packing_factor={packing_estimate.packing_factor}, samples={packing_estimate.samples_analyzed}" + ) + else: + # Fallback to conservative default (model max_seq_length) + optimal_pack_size = calculate_optimal_pack_size(customizer_config) + logger.info(f"Sequence packing enabled with conservative pack_size={optimal_pack_size}") + + cfg["packed_sequence"] = { + "packed_sequence_size": optimal_pack_size, + "split_across_pack": False, + } + + # Use pack size as the effective sequence length for datasets + effective_seq_length = optimal_pack_size + + # === Dataset Configuration (with schema detection) === + _configure_datasets( + cfg, + customizer_config, + prepared, + effective_seq_length, + seed, + _is_embedding_model, + embedding_config, + ) + + # === Dataloader === + # Embedding datasets configure their own specialized dataloaders in _configure_embedding_dataset + if not _is_embedding_model: + cfg["dataloader"] = { + "_target_": "torchdata.stateful_dataloader.StatefulDataLoader", + "collate_fn": "nemo_automodel.components.datasets.utils.default_collater", + "shuffle": True, + } + cfg["validation_dataloader"] = { + "_target_": "torchdata.stateful_dataloader.StatefulDataLoader", + "collate_fn": "nemo_automodel.components.datasets.utils.default_collater", + } + + # === 
PEFT (LoRA) === + if customizer_config.training.training_type in ( + TrainingType.SFT, + TrainingType.DISTILLATION, + ) and customizer_config.training.finetuning_type in (FinetuningType.LORA, FinetuningType.LORA_MERGED): + lora = customizer_config.training.lora + if lora is None: + lora = LoRAConfig() + peft_cfg: dict[str, Any] = { + "_target_": "nemo_automodel.components._peft.lora.PeftConfig", + "dim": lora.rank, + "alpha": lora.alpha, + "dropout": lora.dropout, + "use_triton": lora.use_triton, + "target_modules": lora.target_modules, + } + # TODO: Support exclude_modules via the API + # if lora.exclude_modules: + # peft_cfg["exclude_modules"] = lora.exclude_modules + cfg["peft"] = peft_cfg + + # === Loss === + if not _is_embedding_model: + cfg["loss_fn"] = { + "_target_": "nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy", + } + + # === Custom Model Configuration === + # Check for custom Automodel implementations (e.g., MoE models) + # and configure backend/parallelizer settings + if not _is_embedding_model: + _configure_moe_backend(cfg, customizer_config, trust_remote_code=trust_remote_code) + + # === Knowledge Distillation === + if customizer_config.training.training_type == TrainingType.DISTILLATION: + _configure_kd(cfg, customizer_config, trust_remote_code=trust_remote_code) + + # === Integrations (Runtime Environment) === + + # WandB - check for API key in environment + wandb_config = build_wandb_config( + customizer_config=customizer_config, + job_ctx=job_ctx, + framework="automodel", + ) + if wandb_config: + cfg["wandb"] = wandb_config + logger.info(f"WandB enabled: project={wandb_config.get('project')}") + + # MLflow + mlflow_config = build_mlflow_config( + customizer_config=customizer_config, + job_ctx=job_ctx, + framework="automodel", + ) + if mlflow_config: + cfg["mlflow"] = mlflow_config + logger.info(f"MLflow enabled: {mlflow_config.get('tracking_uri')}") + + return cfg + + +def _configure_moe_backend( + cfg: dict[str, Any], 
customizer_config: TrainingStepConfig, trust_remote_code: bool = False +) -> None: + """ + Configure custom Automodel model implementations for MoE models. + + Automodel has optimized implementations for certain model architectures. + Only MoE models (those with num_local_experts, num_experts, or n_routed_experts in config) + require additional backend and parallelizer configuration. + + Dense models like LlamaForCausalLM may have custom Automodel implementations + (for combined QKV projections, etc.) but don't need MoE-specific config. + + This function: + 1. Detects if the model is an MoE model via config attributes + 2. Only for MoE: Configures the backend (with deepep disabled for stability) + 3. Only for MoE: Configures the parallelizer for expert distribution + """ + # Import here to avoid ModuleNotFoundError in environments where + # transformers is not installed (e.g., during test collection) + from transformers import AutoConfig + + model_path = customizer_config.model.path + + try: + hf_config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code) + architectures = getattr(hf_config, "architectures", None) + + # Check if model has a custom Automodel implementation + has_custom_impl = ( + architectures and len(architectures) > 0 and architectures[0] in ModelRegistry.model_arch_name_to_cls + ) + + if has_custom_impl: + # Check if model is MoE by looking for expert-related config attributes + # MoE models use num_local_experts (Mixtral-style), num_experts (older), or n_routed_experts (NemotronH) + num_experts = ( + getattr(hf_config, "num_local_experts", None) + or getattr(hf_config, "num_experts", None) + or getattr(hf_config, "n_routed_experts", None) + ) + is_moe_model = num_experts is not None and num_experts > 1 + if is_moe_model: + logger.info( + f"Detected MoE model with custom Automodel implementation for architecture: {architectures[0]}. " + f"Adding MoE-specific configurations (num_experts={num_experts})." 
+ ) + + # Validate MoE parallelism constraints. + # Automodel's MoE parallelizer does not support tensor parallelism: + # assert tp_axis_name is None or world_mesh[tp_axis_name].size() == 1 + # See: nemo_automodel/components/moe/parallelizer.py + p = customizer_config.parallelism + total_gpus = p.num_nodes * p.num_gpus_per_node + if total_gpus > 1: + if p.tensor_parallel_size > 1: + raise ValueError( + f"Tensor parallelism (tensor_parallel_size={p.tensor_parallel_size}) is not supported for MoE models." + ) + ep = p.expert_parallel_size + if ep is None or ep <= 1: + raise ValueError( + f"MoE model detected (num_experts={num_experts}) but expert_parallel_size " + f"is {ep or 'not set'}. Multi-GPU MoE training requires expert_parallel_size > 1." + ) + + # Backend configuration for MoE models + # DeepEP is disabled for stability - it's a newer feature that can cause issues + cfg.setdefault("model", {})["backend"] = { + "_target_": "nemo_automodel.components.models.common.utils.BackendConfig", + "enable_deepep": False, + } + + else: + logger.info( + f"Detected custom Automodel implementation for architecture: {architectures[0]}. " + "Not an MoE model, skipping MoE-specific configurations." + ) + else: + logger.debug( + f"No custom Automodel implementation found for {model_path}. " + "Using standard HuggingFace model implementation." + ) + except ValueError: + raise # Re-raise validation errors + except Exception as e: + # Don't fail training if we can't check for custom implementations + logger.warning( + f"Failed to check for custom model implementation: {e}. Using standard HuggingFace model implementation." + ) + + +def _configure_datasets( + cfg: dict[str, Any], + customizer_config: TrainingStepConfig, + prepared: PreparedDataset, + seq_length: int, + seed: int, + is_embedding_model: bool = False, + embedding_config: EmbeddingConfig | None = None, +) -> None: + """ + Configure dataset sections based on detected schema. 
+ + Supports: + - Chat format (OpenAI messages): Uses ChatDataset + - SFT format (prompt/completion): Uses ColumnMappedTextInstructionDataset + - Custom format (via prompt_template): Uses ColumnMappedTextInstructionDataset with custom columns + - Embedding format (query/pos_doc/neg_doc): Uses inline retrieval dataset + + Args: + cfg: Configuration dictionary to populate. + customizer_config: Training step configuration. + prepared: Prepared dataset with merged train/val files. + seq_length: Effective sequence length for dataset configuration. + When sequence packing is enabled, this is the pack size. + Otherwise, this is the model's max_seq_length. + seed: Random seed for reproducibility. + is_embedding_model: Whether this is an embedding model (for dataset format hints). + embedding_config: Embedding model configuration (required for embedding datasets). + """ + train_file = prepared.train_file + validation_file = prepared.validation_file + + # Detect schema from training data + schema, column_keys = detect_dataset_schema( + train_file, + prompt_template=customizer_config.dataset.prompt_template, + ) + + # Validate that embedding models use embedding datasets and vice versa + if is_embedding_model and schema != DatasetSchema.EMBEDDING: + raise ValueError( + f"Model '{customizer_config.model.name}' is detected as an embedding model but the dataset " + f"is in '{schema.value}' format. Embedding models require datasets with 'query', 'pos_doc', " + "and 'neg_doc' fields. Please provide a dataset in embedding format." + ) + if schema == DatasetSchema.EMBEDDING and not is_embedding_model: + raise ValueError( + f"Dataset is in embedding format (query/pos_doc/neg_doc) but model " + f"'{customizer_config.model.name}' is not detected as an embedding model. " + "Embedding datasets can only be used with embedding models." 
+ ) + + if schema == DatasetSchema.EMBEDDING: + # Embedding/retrieval dataset - uses inline format directly + if embedding_config is None: + raise ValueError("embedding_config is required for embedding dataset configuration") + _configure_embedding_dataset(cfg, customizer_config, train_file, validation_file, seed, embedding_config) + elif schema == DatasetSchema.CHAT: + # Chat dataset (OpenAI messages format) + _configure_chat_dataset(cfg, customizer_config, train_file, validation_file, seq_length) + else: + # SFT/Custom dataset (prompt/completion or custom columns) + assert column_keys is not None, "column_keys must be set for SFT/CUSTOM schema" + question_col, answer_col = column_keys + _configure_sft_dataset( + cfg, + customizer_config, + train_file, + validation_file, + question_col, + answer_col, + seq_length, + ) + + +def _configure_chat_dataset( + cfg: dict[str, Any], + customizer_config: TrainingStepConfig, + train_file: Path, + val_file: Path, + seq_length: int, +) -> None: + """Configure ChatDataset for OpenAI messages format.""" + logger.info(f"Configuring ChatDataset for chat format data with seq_length={seq_length}") + + # Resolve chat template using priority-based selection: + # 1. Fileset metadata chat_template (from model entity spec, highest priority) + # 2. Custom template from DEFAULT_CHAT_TEMPLATES (if model.name matches) + # 3. Model's built-in tokenizer template (fallback) + chat_template = resolve_chat_template( + model_path=customizer_config.model.path, + model_name=customizer_config.model.name, + user_template=customizer_config.model.chat_template, + ) + pp_enabled = customizer_config.parallelism.pipeline_parallel_size > 1 + # Note: "split" is required by Automodel's pack_dataset() when sequence packing is enabled. + # Without it, build_dataloader() raises AttributeError accessing cfg_ds.split. 
+ cfg["dataset"] = { + "_target_": "nemo_automodel.components.datasets.llm.chat_dataset.ChatDataset", + "path_or_dataset_id": str(train_file), + "split": "train", + "seq_length": seq_length, + "padding": "do_not_pad" if not pp_enabled else "max_length", + } + cfg["validation_dataset"] = { + "_target_": "nemo_automodel.components.datasets.llm.chat_dataset.ChatDataset", + "path_or_dataset_id": str(val_file), + "split": "validation", + "seq_length": seq_length, + "padding": "do_not_pad" if not pp_enabled else "max_length", + } + + # Add chat template if available + if chat_template: + cfg["dataset"]["chat_template"] = chat_template + cfg["validation_dataset"]["chat_template"] = chat_template + logger.info("Added chat template to dataset config") + else: + logger.warning("No chat template found - ChatDataset may fail") + + # Store resolved template in config for checkpoint processing + # This ensures the same template is used during training and applied to output + cfg["_resolved_chat_template"] = chat_template + + +def _configure_sft_dataset( + cfg: dict[str, Any], + customizer_config: TrainingStepConfig, + train_file: Path, + val_file: Path, + question_col: str, + answer_col: str, + seq_length: int, +) -> None: + """Configure ColumnMappedTextInstructionDataset for SFT/custom format.""" + logger.info( + f"Configuring SFT dataset with columns: question={question_col}, answer={answer_col}, seq_length={seq_length}" + ) + pp_enabled = customizer_config.parallelism.pipeline_parallel_size > 1 + # Note: "split" is required by Automodel's pack_dataset() when sequence packing is enabled. + # Without it, build_dataloader() raises AttributeError accessing cfg_ds.split. 
+ cfg["dataset"] = { + "_target_": "nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset", + "path_or_dataset_id": str(train_file), + "split": "train", + "column_mapping": { + "question": question_col, + "answer": answer_col, + }, + "seq_length": seq_length, + "answer_only_loss_mask": True, + "padding": "do_not_pad" if not pp_enabled else "max_length", + "truncation": "longest_first", + } + cfg["validation_dataset"] = { + "_target_": "nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset", + "path_or_dataset_id": str(val_file), + "split": "validation", + "column_mapping": { + "question": question_col, + "answer": answer_col, + }, + "seq_length": seq_length, + "answer_only_loss_mask": True, + "padding": "do_not_pad" if not pp_enabled else "max_length", + "truncation": "longest_first", + } + + +def _configure_embedding_dataset( + cfg: dict[str, Any], + customizer_config: TrainingStepConfig, + train_file: Path, + val_file: Path, + seed: int, + embedding_config: EmbeddingConfig, +) -> None: + """Configure embedding/retrieval dataset for biencoder training. + + Uses Automodel's inline retrieval dataset format which directly accepts + Customizer's embedding format without conversion: + {"query": "...", "pos_doc": "...", "neg_doc": ["...", "..."]} + + This uses retrieval_dataset_inline.make_retrieval_dataset which handles: + - Loading inline text directly from JSONL + - RetrievalBiencoderCollator for tokenization and batching + + Args: + cfg: Configuration dictionary to populate. + customizer_config: Training step configuration. + train_file: Path to training JSONL file. + val_file: Path to validation JSONL file. + seed: Random seed for reproducibility. + embedding_config: Embedding model configuration. 
+ """ + + logger.info(f"Configuring embedding dataset with train_n_passages={embedding_config.train_n_passages}") + + cfg["dataloader"] = { + "_target_": "torchdata.stateful_dataloader.StatefulDataLoader", + "dataset": { + "_target_": "nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset", + "data_dir_list": [str(train_file)], + "data_type": "train", + "train_n_passages": embedding_config.train_n_passages, + "seed": seed, + "do_shuffle": True, + }, + "collate_fn": { + "_target_": "nemo_automodel.components.datasets.llm.RetrievalBiencoderCollator", + "q_max_len": embedding_config.query_max_length, + "p_max_len": embedding_config.passage_max_length, + "query_prefix": embedding_config.query_prefix, + "passage_prefix": embedding_config.passage_prefix, + "pad_to_multiple_of": 8, + }, + "shuffle": True, + "num_workers": 0, + } + + if val_file and val_file.exists(): + cfg["validation_dataloader"] = { + "_target_": "torchdata.stateful_dataloader.StatefulDataLoader", + "dataset": { + "_target_": "nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset", + "data_dir_list": [str(val_file)], + "data_type": "eval", + "train_n_passages": embedding_config.train_n_passages, + "eval_negative_size": get_eval_negative_size(embedding_config), + "seed": seed, + "do_shuffle": False, + }, + "collate_fn": { + "_target_": "nemo_automodel.components.datasets.llm.RetrievalBiencoderCollator", + "q_max_len": embedding_config.query_max_length, + "p_max_len": embedding_config.passage_max_length, + "query_prefix": embedding_config.query_prefix, + "passage_prefix": embedding_config.passage_prefix, + "padding": "longest", + "pad_to_multiple_of": 8, + }, + "batch_size": customizer_config.batch.micro_batch_size, + "shuffle": False, + "num_workers": 0, + } + + +def _verify_tokenizer_compatibility(student_path: str, teacher_path: str, trust_remote_code: bool = False) -> None: + """ + Verify that student and teacher models have compatible 
tokenizers. + + Knowledge distillation requires the student and teacher to have the same + vocabulary so their logit spaces are aligned. This check prevents subtle + bugs where training appears to work but produces garbage outputs. + + Raises: + ValueError: If tokenizers are incompatible + """ + # Import here to avoid ModuleNotFoundError in environments where + # transformers is not installed (e.g., during test collection) + from transformers import AutoTokenizer + + try: + student_tokenizer = AutoTokenizer.from_pretrained(student_path, trust_remote_code=trust_remote_code) + teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_path, trust_remote_code=trust_remote_code) + + if student_tokenizer.vocab_size != teacher_tokenizer.vocab_size: + raise ValueError( + f"Tokenizer vocabulary size mismatch: student has {student_tokenizer.vocab_size} tokens, " + f"teacher has {teacher_tokenizer.vocab_size} tokens. " + "Knowledge distillation requires matching vocabularies." + ) + + # Optional: Could also check for specific token mismatches + logger.info(f"Tokenizer compatibility verified: both models have vocab_size={student_tokenizer.vocab_size}") + + except Exception as e: + if "vocabulary size mismatch" in str(e): + raise + # Log but don't fail for other tokenizer loading issues + # (e.g., network issues, missing files) - the training will fail later with a clearer error + logger.warning(f"Could not verify tokenizer compatibility: {e}") + + +def _configure_kd(cfg: dict[str, Any], customizer_config: TrainingStepConfig, trust_remote_code: bool = False) -> None: + """ + Configure Knowledge Distillation for Automodel's KD recipe. 
+ + Automodel's KnowledgeDistillationRecipeForNextTokenPrediction requires: + - teacher_model: Frozen teacher model for soft targets + - kd_ratio: Balance between CE and KD loss (0=CE only, 1=KD only) + - kd_loss_fn: KL-divergence loss with temperature scaling + - offload_teacher_model: Optional CPU offloading for memory efficiency + """ + kd_config = customizer_config.training.kd + if not kd_config or not kd_config.teacher_model: + raise ValueError( + "Knowledge distillation requires training.kd.teacher to be set. " + "Ensure the job input includes a teacher model." + ) + + # Verify tokenizer compatibility before proceeding + _verify_tokenizer_compatibility( + customizer_config.model.path, + kd_config.teacher_model.path, + trust_remote_code=trust_remote_code, + ) + + # Teacher model (frozen, same architecture loading as student) + # Use teacher's precision if specified, otherwise fall back to student's precision + teacher_precision = kd_config.teacher_model.precision or customizer_config.model.precision + cfg["teacher_model"] = { + "_target_": "nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained", + "pretrained_model_name_or_path": kd_config.teacher_model.path, + "torch_dtype": teacher_precision.to_torch_dtype() if teacher_precision else "auto", + "attn_implementation": kd_config.teacher_model.attn_implementation, + "trust_remote_code": kd_config.teacher_model.trust_remote_code, + } + + # KD loss function with temperature + cfg["kd_loss_fn"] = { + "_target_": "nemo_automodel.components.loss.kd_loss.KDLoss", + "ignore_index": -100, + "temperature": kd_config.temperature, + "fp32_upcast": True, # Recommended for numerical stability + } + + # KD ratio (blend between CE and KD loss) + cfg["kd_ratio"] = kd_config.ratio + + # Optional: Offload teacher to CPU for memory efficiency + if kd_config.offload_teacher: + cfg["offload_teacher_model"] = True + logger.info("Teacher model will be offloaded to CPU between forward passes") + + +def 
get_eval_negative_size(embedding_config: EmbeddingConfig) -> int: + """Get the effective eval_negative_size value from embedding config. + + Returns the user-specified eval_negative_size if set, otherwise defaults + to train_n_passages - 1 for consistent train/eval behavior. + + The -1 relationship exists because: + - train_n_passages = total passages = 1 positive + N negatives + - eval_negative_size = just the negative count = N + - So: eval_negative_size = train_n_passages - 1 (subtracting the positive) + + Example: train_n_passages=5 (1 pos + 4 neg) -> eval_negative_size=4 + """ + if embedding_config.eval_negative_size is not None: + return embedding_config.eval_negative_size + return embedding_config.train_n_passages - 1 diff --git a/services/automodel/src/nmp/automodel/tasks/training/backends/finetune.py b/services/automodel/src/nmp/automodel/tasks/training/backends/finetune.py new file mode 100644 index 00000000..abaf469b --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/training/backends/finetune.py @@ -0,0 +1,260 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Automodel training subprocess entry point. + +Wraps nemo_automodel recipes with Jobs-service progress reporting (SFT, KD, embedding). 
+""" + +from __future__ import annotations + +import logging +from typing import Any, Protocol, runtime_checkable + +from nemo_automodel.components.checkpoint.checkpointing import Checkpointer +from nemo_automodel.components.config._arg_parser import parse_args_and_load_config +from nemo_automodel.components.training.step_scheduler import StepScheduler +from nemo_automodel.recipes.biencoder.train_biencoder import TrainBiencoderRecipe +from nemo_automodel.recipes.llm.kd import KnowledgeDistillationRecipeForNextTokenPrediction +from nemo_automodel.recipes.llm.train_ft import TrainFinetuneRecipeForNextTokenPrediction +from nmp.automodel.app.jobs.context import NMPJobContext +from nmp.automodel.tasks.training.backends.callbacks import TrainingProgressCallback +from nmp.automodel.tasks.training.progress import JobsServiceProgressReporter + +logger = logging.getLogger(__name__) + + +@runtime_checkable +class AutomodelRecipe(Protocol): + """Protocol defining the interface we need from Automodel recipes. + + This makes the dependencies explicit and enables type checking, unlike + the previous mixin approach that relied on implicit attributes. + """ + + cfg: Any + step_scheduler: StepScheduler + checkpointer: Checkpointer + dist_env: Any + + def setup(self) -> None: + """Build all components needed for training.""" + ... + + def run_train_validation_loop(self) -> None: + """Run the main training/validation loop.""" + ... + + def log_train_metrics(self, log_data: Any) -> None: + """Log training metrics.""" + ... + + def log_val_metrics(self, *args: Any, **kwargs: Any) -> None: + """Log validation metrics. + + Note: Signature varies across Automodel recipes: + - LLM/KD: (val_name, log_data, metric_logger=None) + - VLM/biencoder/seq_cls: (log_data) + """ + ... + + def save_checkpoint( + self, + epoch: int, + step: int, + train_loss: float, + val_loss: dict[str, float] | None = None, + best_metric_key: str = "default", + ) -> None: + """Save a checkpoint.""" + ... 
+ + +class AutomodelRecipeWrapper: + """Wraps an Automodel recipe with Jobs-service progress reporting.""" + + def __init__(self, recipe: AutomodelRecipe, job_ctx: NMPJobContext | None = None): + """Initialize the wrapper with an Automodel recipe. + + Args: + recipe: Any recipe implementing the AutomodelRecipe protocol + (SFT, KD, biencoder, etc.). + job_ctx: NeMo Platform job context for progress reporting (optional, + defaults to environment variables). + """ + self._job_ctx = job_ctx or NMPJobContext.from_env() + self._reporter = JobsServiceProgressReporter(self._job_ctx) + self._reporter.report_running("automodel_recipe_setup") + + self._recipe = recipe + self._recipe.setup() + + self.max_steps = getattr(self._recipe.step_scheduler, "max_steps", None) or 100 + self.num_epochs = getattr(self._recipe.step_scheduler, "num_epochs", None) or 1 + + self.callback = TrainingProgressCallback(self._reporter) + logger.info(f"Automodel recipe wrapper initialized: max_steps={self.max_steps}, num_epochs={self.num_epochs}") + + # Store original methods before patching + self._original_log_train_metrics = recipe.log_train_metrics + self._original_log_val_metrics = recipe.log_val_metrics + self._original_save_checkpoint = recipe.save_checkpoint + + # Monkey-patch the recipe's methods to add our callbacks + recipe.log_train_metrics = self._log_train_metrics # type: ignore[method-assign] + recipe.log_val_metrics = self._log_val_metrics # type: ignore[method-assign] + recipe.save_checkpoint = self._save_checkpoint # type: ignore[method-assign] + + @property + def recipe(self) -> AutomodelRecipe: + """Access the underlying recipe.""" + return self._recipe + + def run_train_validation_loop(self) -> None: + """Run training and close the progress callback.""" + try: + self.callback.report_training_start(self.max_steps, self.num_epochs) + self._recipe.run_train_validation_loop() + finally: + if self.callback: + self.callback.close() + logger.info("Training progress callback closed") + 
+ def _log_train_metrics(self, log_data: Any) -> None: + """Wrapped log_train_metrics with Jobs-service reporting.""" + self._original_log_train_metrics(log_data) + if self.callback and log_data: + try: + metrics = getattr(log_data, "metrics", {}) + self.callback.report_train_step( + step=getattr(log_data, "step", 0) + 1, # Convert to 1-based + epoch=getattr(log_data, "epoch", 0) + 1, # Convert to 1-based + loss=metrics.get("loss", 0.0), + lr=metrics.get("lr"), + grad_norm=metrics.get("grad_norm"), + ) + except Exception as e: + logger.warning(f"Failed to report training progress: {e}") + + try: + if self._recipe.step_scheduler.is_last_batch: + self.callback.report_epoch_end( + step=self._recipe.step_scheduler.step + 1, + epoch=self._recipe.step_scheduler.epoch + 1, + ) + except Exception as e: + logger.warning(f"Failed to report epoch end: {e}") + + def _log_val_metrics(self, *args: Any, **kwargs: Any) -> None: + """Wrapped log_val_metrics with Jobs-service reporting. + + Handles different Automodel recipe signatures: + - LLM/KD: (val_name, log_data, metric_logger=None) + - VLM/biencoder/seq_cls: (log_data) + """ + # Call original method first with whatever args were passed + self._original_log_val_metrics(*args, **kwargs) + + # Extract log_data from args (it's always the last positional arg before kwargs) + # LLM signature: (val_name, log_data, metric_logger=None) -> log_data is args[1] + # VLM/biencoder signature: (log_data) -> log_data is args[0] + log_data = None + if len(args) >= 2: + # LLM/KD style: (val_name, log_data, ...) 
+ log_data = args[1] + elif len(args) == 1: + # VLM/biencoder style: (log_data) + log_data = args[0] + + if self.callback and log_data: + try: + metrics = getattr(log_data, "metrics", {}) + self.callback.report_validation( + step=getattr(log_data, "step", 0) + 1, # Convert to 1-based + epoch=getattr(log_data, "epoch", 0) + 1, # Convert to 1-based + val_loss=metrics.get("val_loss", 0.0), + ) + except Exception as e: + logger.warning(f"Failed to report validation progress: {e}") + + def _save_checkpoint( + self, + epoch: int, + step: int, + train_loss: float, + val_loss: dict[str, float] | None = None, + best_metric_key: str = "default", + ) -> None: + """Wrapped save_checkpoint with Jobs-service reporting.""" + self._original_save_checkpoint(epoch, step, train_loss, val_loss, best_metric_key) + if self.callback: + try: + checkpoint_dir = getattr( + getattr(self._recipe.checkpointer, "config", None), + "checkpoint_dir", + None, + ) + self.callback.report_checkpoint_saved( + step=step + 1, # Convert to 1-based + epoch=epoch + 1, # Convert to 1-based + checkpoint_path=str(checkpoint_dir) if checkpoint_dir else None, + ) + except Exception as e: + logger.warning(f"Failed to report checkpoint save: {e}") + + +def _is_kd_config(cfg: Any) -> bool: + """Check if config is for knowledge distillation.""" + return cfg.get("teacher_model") is not None or cfg.get("kd_ratio") is not None + + +def _is_biencoder_config(cfg: Any) -> bool: + """Check if config is for biencoder/embedding model training. + + Detects biencoder configs by checking if model._target_ contains 'biencoder'. + + Note: ConfigNode automatically resolves _target_ to the actual function/class, + so we check the function's __module__ or __qualname__ for 'biencoder'. 
def create_automodel_recipe(cfg: Any) -> AutomodelRecipeWrapper:
    """Build the recipe that matches *cfg* and wrap it for progress reporting.

    Selection order: biencoder configs first, then knowledge distillation,
    otherwise plain SFT fine-tuning.
    """
    # Pick the recipe class and its log line; the KD check only runs when the
    # config is not a biencoder config.
    if _is_biencoder_config(cfg):
        announcement = "Detected biencoder config, using embedding model recipe"
        recipe_cls = TrainBiencoderRecipe
    elif _is_kd_config(cfg):
        announcement = "Detected Knowledge Distillation config, using KD recipe"
        recipe_cls = KnowledgeDistillationRecipeForNextTokenPrediction
    else:
        announcement = "Using SFT fine-tuning recipe"
        recipe_cls = TrainFinetuneRecipeForNextTokenPrediction

    logger.info(announcement)
    return AutomodelRecipeWrapper(recipe_cls(cfg))
SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Chat template resolution and application for training backends. + +This module provides: +1. Priority-based chat template selection (resolve_chat_template) +2. Applying chat templates to output checkpoints (apply_chat_template_to_checkpoint) + +Chat template priority order: +1. User-provided template (via API) +2. Custom template from DEFAULT_CHAT_TEMPLATES map (enhanced for tool calling) +3. Model's built-in tokenizer template (fallback) + +The custom templates in the templates/ directory extend base model templates with: +- Tool calling support: , , formatting +- Generation markers: {% generation %}...{% endgeneration %} blocks for loss masking +- Enhanced compatibility across models +""" + +import json +import logging +from pathlib import Path + +logger = logging.getLogger(__name__) + +# Directory containing custom chat template jinja files +TEMPLATES_DIR = Path(__file__).parent / "templates" + +# ============================================================================ +# Model Name Constants +# ============================================================================ + +# Meta Llama models +META_LLAMA_31_8B_INSTRUCT = "meta/llama-3.1-8b-instruct" +META_LLAMA_31_70B_INSTRUCT = "meta/llama-3.1-70b-instruct" +META_LLAMA_31_405B_INSTRUCT = "meta/llama-3.1-405b-instruct" +META_LLAMA_32_1B = "meta/llama-3.2-1b" +META_LLAMA_32_1B_INSTRUCT = "meta/llama-3.2-1b-instruct" +META_LLAMA_32_3B_INSTRUCT = "meta/llama-3.2-3b-instruct" +META_LLAMA_33_70B_INSTRUCT = "meta/llama-3.3-70b-instruct" +# NVIDIA Nemotron models +NVIDIA_NEMOTRON_31_8B = "nvidia/nemotron-nano-llama-3.1-8b" +NVIDIA_NEMOTRON_31_70B = "nvidia/nemotron-llama-3.1-70b" +NVIDIA_NEMOTRON_33_49B = "nvidia/nemotron-super-llama-3.3-49b" +NVIDIA_NEMOTRON_33_49B_V1_5 = "nvidia/nemotron-super-llama-3.3-49b-v1.5" +# NIM model names (alternative naming) 
def _load_template_file(template_filename: str) -> str | None:
    """Read a custom chat template from TEMPLATES_DIR by filename.

    Returns the file's text, or None (after logging a warning) when no such
    path exists under TEMPLATES_DIR.
    """
    template_path = TEMPLATES_DIR / template_filename
    if not template_path.exists():
        logger.warning(f"Template file not found: {template_path}")
        return None
    return template_path.read_text(encoding="utf-8")
def apply_chat_template_to_checkpoint(
    output_path: Path,
    chat_template: str | None,
) -> None:
    """
    Write a chat template into the output checkpoint's tokenizer_config.json.

    Only the "chat_template" key is set; every other entry already present in
    the config is written back unchanged. The call is a no-op (with a warning)
    when no template is given or the checkpoint has no tokenizer_config.json.

    Args:
        output_path: Path to the checkpoint directory containing tokenizer_config.json.
        chat_template: The chat template string to apply. If None or empty, skips application.
    """
    if not chat_template:
        logger.warning("No chat template provided, skipping")
        return

    tokenizer_config = output_path / "tokenizer_config.json"
    if not tokenizer_config.exists():
        logger.warning(f"tokenizer_config.json not found at {output_path}")
        return

    # Decode explicitly as UTF-8: relying on the locale default can mis-decode
    # (or fail on) non-ASCII tokens and templates on non-UTF-8 systems.
    with open(tokenizer_config, "r", encoding="utf-8") as f:
        config = json.load(f)

    config["chat_template"] = chat_template

    with open(tokenizer_config, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)

    logger.info("Applied chat template to output checkpoint")
+ +"""Dataset discovery, merge/split, and schedule helpers for Automodel training.""" + +import json +import logging +import random +import re +import shutil +import subprocess +from dataclasses import dataclass +from enum import Enum +from pathlib import Path +from typing import Any, Optional, Union + +from nmp.automodel.app.constants import DEFAULT_SEED + +logger = logging.getLogger(__name__) + +# Dataset directory constants for merged files (we control this structure) +MERGED_DIR = "merged" +TRAIN_FILE = "train.jsonl" +VAL_FILE = "validation.jsonl" + +# Heuristic patterns for discovering training files +TRAIN_PATTERNS = [ + "train*.jsonl", + "training*.jsonl", + "train*.json", + "training*.json", +] +TRAIN_DIRS = ["train", "training"] + +# Heuristic patterns for discovering validation files +VAL_PATTERNS = [ + "val*.jsonl", + "validation*.jsonl", + "val*.json", + "validation*.json", + "dev*.jsonl", + "dev*.json", +] +VAL_DIRS = ["val", "validation", "dev"] + + +class DatasetSchema(str, Enum): + """Detected dataset schema type.""" + + CHAT = "chat" # OpenAI messages format: {"messages": [...]} + SFT = "sft" # Prompt/completion: {"prompt": ..., "completion": ...} + CUSTOM = "custom" # Custom columns via prompt_template + EMBEDDING = "embedding" # Retrieval format: {"query": ..., "pos_doc": ..., "neg_doc": [...]} + + +class DatasetFormatError(Exception): + """Raised when dataset format is invalid or unsupported.""" + + pass + + +def detect_dataset_schema( + file_path: Path, + prompt_template: str | None = None, +) -> tuple[DatasetSchema, tuple[str, ...] | None]: + """ + Detect dataset schema by sampling the first line. + + Supports four formats: + 1. Chat format: {"messages": [{"role": "user", ...}, {"role": "assistant", ...}]} + 2. Embedding format: {"query": "...", "pos_doc": "...", "neg_doc": ["...", "..."]} + 3. SFT format: {"prompt": "...", "completion": "..."} + 4. 
Custom format: Any two-column format specified via prompt_template like "{input} {output}" + + Args: + file_path: Path to the JSONL dataset file. + prompt_template: Optional template string with two placeholders like "{input} {output}". + + Returns: + Tuple of (schema_type, column_keys) where: + - CHAT: column_keys is None + - EMBEDDING: column_keys is ("query", "pos_doc", "neg_doc") + - SFT/CUSTOM: column_keys is (question_col, answer_col) + + Raises: + DatasetFormatError: If the dataset format cannot be detected or is invalid. + """ + with open(file_path, "r", encoding="utf-8") as f: + line = f.readline() + + try: + obj: dict[str, Any] = json.loads(line) + except json.JSONDecodeError as e: + raise DatasetFormatError(f"Invalid JSON in {file_path}: {e}") + + # Check for chat format (OpenAI messages) + if "messages" in obj and isinstance(obj["messages"], list): + if len(obj["messages"]) > 0 and isinstance(obj["messages"][0], dict): + if "role" in obj["messages"][0]: + logger.info(f"Detected chat dataset format in {file_path}") + return DatasetSchema.CHAT, None + + # Check for embedding/retrieval format + # Format: {"query": "...", "pos_doc": "...", "neg_doc": ["...", "..."]} + if "query" in obj and "pos_doc" in obj and "neg_doc" in obj: + if isinstance(obj["query"], str) and isinstance(obj["pos_doc"], str) and isinstance(obj["neg_doc"], list): + logger.info(f"Detected embedding/retrieval dataset format in {file_path}") + return DatasetSchema.EMBEDDING, ("query", "pos_doc", "neg_doc") + + # Check for custom prompt_template format + if prompt_template: + keys = re.findall(r"\{(.*?)\}", prompt_template) + if len(keys) == 2: + # Validate keys exist in data + if all(k in obj for k in keys): + logger.info(f"Detected custom template format with keys {keys}") + return DatasetSchema.CUSTOM, (keys[0], keys[1]) + else: + raise DatasetFormatError( + f"prompt_template keys {keys} not found in dataset. 
Available keys: {list(obj.keys())}" + ) + else: + raise DatasetFormatError(f"prompt_template must have exactly 2 placeholders, got: {prompt_template}") + + # Check for standard SFT format (prompt/completion) + if "prompt" in obj and "completion" in obj: + logger.info(f"Detected SFT (prompt/completion) format in {file_path}") + return DatasetSchema.SFT, ("prompt", "completion") + + # Fallback - try to find any two string columns + string_cols = [k for k, v in obj.items() if isinstance(v, str)] + if len(string_cols) >= 2: + logger.warning(f"Could not detect standard format, using first two string columns: {string_cols[:2]}") + return DatasetSchema.SFT, (string_cols[0], string_cols[1]) + + raise DatasetFormatError( + f"Could not detect dataset format. Expected 'messages' (chat) or " + f"'prompt'/'completion' (SFT) columns. Found: {list(obj.keys())}" + ) + + +def _count_jsonl_samples_python(file_path: Path) -> int: + """Pure Python implementation of line counting (fallback).""" + count = 0 + with open(file_path, "r", encoding="utf-8") as f: + for line in f: + if line.strip(): # Non-empty line + count += 1 + return count + + +def count_jsonl_samples(file_path: Path) -> int: + """ + Count the number of non-empty lines in a JSONL file. + + Uses grep for efficiency with large files when available, + falls back to pure Python implementation otherwise. + + Args: + file_path: Path to the JSONL file. + + Returns: + Number of non-empty lines (samples) in the file. 
+ """ + # Check if grep is available + if shutil.which("grep") is None: + return _count_jsonl_samples_python(file_path) + + try: + # Use `grep -c "\S"` to count non-empty lines (excludes trailing empty lines) + result = subprocess.check_output(["grep", "-c", r"\S", str(file_path)], text=True) + return int(result.strip()) + except subprocess.CalledProcessError: + # grep returns exit code 1 if no matches (empty file) + return 0 + except OSError: + # Fallback if subprocess fails for any reason + return _count_jsonl_samples_python(file_path) + + +def compute_val_check_interval( + steps_per_epoch: int, + max_steps: int, + val_check_interval: Optional[Union[int, float]] = None, +) -> int: + """ + Compute how often to run validation (in steps). + + This handles the semantic difference between: + - float <= 1.0: Fraction of epoch (e.g., 0.5 = validate at 50% of each epoch) + - int or float > 1.0: Absolute step count + + Args: + steps_per_epoch: Number of gradient steps per epoch. + max_steps: Maximum training steps. + val_check_interval: User-provided interval (float for fraction, int for steps). + + Returns: + Integer step count for validation interval. + + Raises: + ValueError: If val_check_interval is negative. 
+ """ + effective_steps = min(steps_per_epoch, max_steps) + + if val_check_interval is None or val_check_interval == 0: + # Default: validate once per epoch (or at end if max_steps < steps_per_epoch) + return effective_steps + + if val_check_interval < 0: + raise ValueError("val_check_interval cannot be negative") + + # Float <= 1.0: interpret as fraction of epoch + if isinstance(val_check_interval, float) and val_check_interval <= 1.0: + interval = max(1, int(val_check_interval * steps_per_epoch)) + else: + # Integer or float > 1.0: treat as absolute step count + interval = int(val_check_interval) + + # Cap at effective_steps + interval = min(interval, effective_steps) + + # Ensure validation happens at least once before training ends + if interval >= max_steps: + interval = max(1, max_steps - 1) + + return interval + + +@dataclass +class PreparedDataset: + """Result of dataset preparation.""" + + merged_dir: Path + train_file: Path + validation_file: Path + train_samples: int + validation_samples: int + + +def _discover_files_by_patterns(base_path: Path, patterns: list[str], dirs: list[str]) -> list[Path]: + """ + Discover files matching patterns or in specific directories. + + Searches for: + 1. Files matching glob patterns in base_path + 2. All .jsonl/.json files in specified subdirectories + + Args: + base_path: Root directory to search. + patterns: Glob patterns to match (e.g., ["train*.jsonl"]). + dirs: Subdirectory names to search (e.g., ["train", "training"]). + + Returns: + Sorted list of discovered file paths. 
+ """ + files: set[Path] = set() + + # Pattern matching in base directory + for pattern in patterns: + for match in base_path.glob(pattern): + if match.is_file(): + files.add(match.resolve()) + + # Files in subdirectories + for dir_name in dirs: + subdir = base_path / dir_name + if subdir.is_dir(): + for f in subdir.iterdir(): + if f.is_file() and f.suffix.lower() in (".jsonl", ".json"): + files.add(f.resolve()) + + return sorted(files) # Sorted for deterministic ordering + + +def discover_dataset_files(dataset_path: Path) -> tuple[list[Path], list[Path]]: + """ + Discover training and validation files using heuristics. + + Heuristics applied (in order): + 1. Files matching train*/training* patterns → training + 2. Files in train/ or training/ directories → training + 3. Files matching val*/validation*/dev* patterns → validation + 4. Files in val/, validation/, or dev/ directories → validation + 5. If only one .jsonl file found → treat as training (will auto-split) + + Args: + dataset_path: Path to the dataset directory. + + Returns: + Tuple of (training_files, validation_files). + + Raises: + DatasetFormatError: If no training files can be found. 
+ """ + dataset_path = Path(dataset_path).resolve() + + if not dataset_path.exists(): + raise DatasetFormatError(f"Dataset path does not exist: {dataset_path}") + + # If path is a file, treat it as the training file + if dataset_path.is_file(): + logger.info(f"Dataset path is a file, treating as training data: {dataset_path}") + return [dataset_path], [] + + # Discover training files + train_files = _discover_files_by_patterns(dataset_path, TRAIN_PATTERNS, TRAIN_DIRS) + + # Discover validation files + val_files = _discover_files_by_patterns(dataset_path, VAL_PATTERNS, VAL_DIRS) + + # Fallback: if no files found with patterns, check for any .jsonl files + if not train_files and not val_files: + all_jsonl = sorted(f for f in dataset_path.glob("*.jsonl") if f.is_file()) + if len(all_jsonl) == 1: + logger.info(f"Found single JSONL file, treating as training data: {all_jsonl[0]}") + train_files = all_jsonl + elif len(all_jsonl) > 1: + # Ambiguous - could be train/val or multiple training files + logger.warning( + f"Found {len(all_jsonl)} JSONL files without clear train/val naming. " + f"Treating all as training data: {[f.name for f in all_jsonl]}" + ) + train_files = all_jsonl + + if not train_files: + raise DatasetFormatError( + f"No training files found in {dataset_path}. " + f"Expected files matching patterns like train*.jsonl or a train/ directory." + ) + + logger.info(f"Discovered {len(train_files)} training file(s): {[f.name for f in train_files]}") + if val_files: + logger.info(f"Discovered {len(val_files)} validation file(s): {[f.name for f in val_files]}") + else: + logger.info("No validation files found - will auto-split from training data") + + return train_files, val_files + + +def _merge_files(files: list[Path], output_file: Path) -> int: + """ + Merge multiple JSONL files into a single file. + + Args: + files: List of files to merge. + output_file: Output file path. + + Returns: + Total number of samples (non-empty lines) in merged file. 
def _create_val_split(
    train_file: Path,
    output_train: Path,
    output_val: Path,
    val_ratio: float = 0.1,
    seed: int = DEFAULT_SEED,
) -> tuple[int, int]:
    """
    Split a training file into train and validation sets.

    Blank lines are dropped and every remaining line is parsed and
    re-serialized as JSON. The first max(1, int(n * val_ratio)) records of the
    shuffled data go to validation, so a non-empty input always yields at least
    one validation record (and a single-record input yields an empty train set).

    Args:
        train_file: Source training file. May be the same path as output_train.
        output_train: Output path for training split.
        output_val: Output path for validation split.
        val_ratio: Fraction of data to use for validation (default: 10%).
        seed: Random seed for reproducible splits (default: DEFAULT_SEED).

    Returns:
        Tuple of (train_samples, validation_samples).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. Raised
            before any output file is opened, so existing files are untouched.
    """
    with open(train_file, "r", encoding="utf-8") as f:
        # Normalize up front: an invalid line must fail here, not halfway
        # through rewriting a file that may be the very file being read.
        records = [json.dumps(json.loads(line)) + "\n" for line in f if line.strip()]

    # A private generator gives the same permutation as seeding the module-level
    # one, without resetting the process-wide random state for other code.
    random.Random(seed).shuffle(records)

    val_size = max(1, int(len(records) * val_ratio))
    val_records = records[:val_size]
    train_records = records[val_size:]

    output_train.parent.mkdir(parents=True, exist_ok=True)
    output_val.parent.mkdir(parents=True, exist_ok=True)

    with open(output_train, "w", encoding="utf-8") as f:
        f.writelines(train_records)

    with open(output_val, "w", encoding="utf-8") as f:
        f.writelines(val_records)

    logger.info(
        f"Created validation split: {len(train_records)} train samples, {len(val_records)} val samples "
        f"({val_ratio:.0%} split)"
    )

    return len(train_records), len(val_records)
+ """ + dataset_path = Path(dataset_path).resolve() + + # Determine output directory + if output_dir is None: + if dataset_path.is_file(): + merged_dir = dataset_path.parent / MERGED_DIR + else: + merged_dir = dataset_path / MERGED_DIR + else: + merged_dir = Path(output_dir).resolve() + + train_output = merged_dir / TRAIN_FILE + validation_output = merged_dir / VAL_FILE + + # Discover files + train_files, val_files = discover_dataset_files(dataset_path) + + # Merge training files + if len(train_files) == 1 and not val_files: + # Single file, no validation - need to split + logger.info("Single training file with no validation data - creating split") + train_samples, validation_samples = _create_val_split( + train_files[0], + train_output, + validation_output, + val_ratio=val_split_ratio, + seed=seed, + ) + else: + # Merge training files + train_samples = _merge_files(train_files, train_output) + logger.info(f"Merged {len(train_files)} training file(s) → {train_output} ({train_samples} samples)") + + if val_files: + # Merge validation files + validation_samples = _merge_files(val_files, validation_output) + logger.info( + f"Merged {len(val_files)} validation file(s) → {validation_output} ({validation_samples} samples)" + ) + else: + # Auto-split from merged training file + logger.info("No validation files - creating split from merged training data") + # Read merged, split, re-write + train_samples, validation_samples = _create_val_split( + train_output, + train_output, + validation_output, + val_ratio=val_split_ratio, + seed=seed, + ) + + return PreparedDataset( + merged_dir=merged_dir, + train_file=train_output, + validation_file=validation_output, + train_samples=train_samples, + validation_samples=validation_samples, + ) diff --git a/services/automodel/src/nmp/automodel/tasks/training/datasets/schemas.py b/services/automodel/src/nmp/automodel/tasks/training/datasets/schemas.py new file mode 100644 index 00000000..487c0151 --- /dev/null +++ 
b/services/automodel/src/nmp/automodel/tasks/training/datasets/schemas.py @@ -0,0 +1,430 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related +# documentation and any modifications thereto. Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +# ============================================================================= +# Dataset Schemas for DPO Training +# ============================================================================= +# Preference Dataset Schemas for DPO Training: +# - PreferenceDataset: Native format with context + ranked completions +# - BinaryPreferenceDataset: Simple prompt/chosen/rejected strings +# - HelpSteer3Dataset: NVIDIA HelpSteer3 format with preference scores +# - Tulu3PreferenceDataset: AllenAI Tulu3 format with message lists +# +# SFT Dataset Schemas: +# - SFTDatasetItemSchema: Standard prompt/completion format +from typing import Annotated, Any, List, Literal, Optional, Union + +from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag, model_validator + +# Dataset class names from nmp.automodel.tasks.training.backends.nemo_rl.preference_datasets +# These constants ensure consistency between the discriminator and Tag values +PREFERENCE_DATASET = "PreferenceDataset" +BINARY_PREFERENCE_DATASET = "BinaryPreferenceDataset" +HELPSTEER3_DATASET = "HelpSteer3" +TULU3_PREFERENCE_DATASET = "Tulu3Preference" + + +class ChatMessage(BaseModel): + """A single message in a conversation.""" + + role: str = Field(..., description="The role of the message sender (e.g., 'user', 'assistant', 'system')") + content: str = Field(..., description="The 
content of the message") + + +class CompletionItem(BaseModel): + """A ranked completion in a preference dataset.""" + + rank: int = Field(..., description="Rank of this completion (0 = best/chosen, higher = worse)") + completion: List[ChatMessage] = Field(..., description="The completion as a list of messages") + + +class PreferenceDatasetItemSchema(BaseModel): + """Schema for native PreferenceDataset format. + + This is the canonical format used by nemo-rl's PreferenceDataset class. + It supports multi-turn context and multiple ranked completions. + + Example: + { + "context": [{"role": "user", "content": "What is 2+2?"}], + "completions": [ + {"rank": 0, "completion": [{"role": "assistant", "content": "4"}]}, + {"rank": 1, "completion": [{"role": "assistant", "content": "5"}]} + ] + } + """ + + context: List[ChatMessage] = Field( + ..., description="The conversation context (prompt messages including previous turns)" + ) + completions: List[CompletionItem] = Field( + ..., description="List of ranked completions (rank 0 = preferred, rank 1 = rejected, etc.)" + ) + + model_config = ConfigDict(extra="allow") + + +class BinaryPreferenceDatasetItemSchema(BaseModel): + """Schema for BinaryPreferenceDataset format. + + Simple format with prompt, chosen response, and rejected response as strings. + The prompt can be either a string or a list of messages. + + Example: + { + "prompt": "What is the capital of France?", + "chosen": "The capital of France is Paris.", + "rejected": "The capital of France is London." + } + """ + + prompt: Union[str, List[ChatMessage]] = Field(..., description="The input prompt (string or list of messages)") + chosen: str = Field(..., description="The preferred/chosen response") + rejected: str = Field(..., description="The rejected/non-preferred response") + + model_config = ConfigDict(extra="allow") + + +class HelpSteer3DatasetItemSchema(BaseModel): + """Schema for NVIDIA HelpSteer3 preference dataset format. 
def get_preference_dataset_discriminator(v: Any) -> str:
    """Determine the preference dataset schema tag based on field presence.

    Works on both raw dicts (looked up by key) and already-built model
    instances (looked up by attribute). Checks run in this order:
    - PREFERENCE_DATASET: has 'context' and 'completions' (native format)
    - HELPSTEER3_DATASET: has 'overall_preference'
    - TULU3_PREFERENCE_DATASET: 'chosen' is a non-empty list whose first item
      is a message (has 'role'), and 'rejected' is present
    - BINARY_PREFERENCE_DATASET: has 'prompt', 'chosen' and 'rejected'

    Args:
        v: The data to discriminate (dict or model instance)

    Returns:
        The tag constant for the detected schema; PREFERENCE_DATASET when
        nothing matches.
    """

    def _has(field: str) -> bool:
        # Dicts are probed by key, anything else by attribute.
        return field in v if isinstance(v, dict) else hasattr(v, field)

    def _get(field: str) -> Any:
        return v.get(field) if isinstance(v, dict) else getattr(v, field, None)

    # Native PreferenceDataset format: context + completions
    if _has("completions") and _has("context"):
        return PREFERENCE_DATASET

    # HelpSteer3 format: has overall_preference score
    if _has("overall_preference"):
        return HELPSTEER3_DATASET

    # Tulu3 must be checked BEFORE the binary format: Tulu3 data may also carry
    # a 'prompt' field, so the two are told apart by 'chosen' being a message list.
    if _has("chosen") and _has("rejected"):
        chosen = _get("chosen")
        if isinstance(chosen, list) and len(chosen) > 0:
            first = chosen[0]
            # A message is either a raw dict with a 'role' key or an object
            # exposing a 'role' attribute.
            if "role" in first if isinstance(first, dict) else hasattr(first, "role"):
                return TULU3_PREFERENCE_DATASET

    # BinaryPreferenceDataset format: prompt + chosen + rejected (as strings)
    if _has("prompt") and _has("chosen") and _has("rejected"):
        return BINARY_PREFERENCE_DATASET

    return PREFERENCE_DATASET  # Default fallback
class SFTChatMessage(BaseModel):
    """A single message in an SFT chat conversation.

    Each message must have a role and at least one of: content, thinking, or a
    non-empty tool_calls list.

    Important: content and thinking are mutually exclusive within a single message.
    If both are needed, they should be in separate messages (e.g., one message with
    thinking followed by another message with content).

    Unknown fields are rejected (``extra="forbid"``).
    """

    role: str = Field(..., description="The role of the message sender (e.g., 'user', 'assistant', 'system')")
    content: str | None = Field(None, description="The content of the message")
    thinking: str | None = Field(None, description="Thinking/reasoning content")
    tool_calls: list[ToolCall] | None = Field(None, description="Tool calls made in this message")

    @staticmethod
    def _schema_extra(schema: dict[str, Any]) -> None:
        """Add anyOf constraint requiring at least one of content, thinking, or tool_calls.

        Mutates the generated JSON schema in place. The tool_calls branch
        demands at least one entry (``minItems: 1``).
        """
        schema["anyOf"] = [
            {
                "required": ["content"],
                "properties": {"content": {"type": "string"}},
                "not": {"required": ["thinking"]},
            },
            {
                "required": ["thinking"],
                "properties": {"thinking": {"type": "string"}},
                "not": {"required": ["content"]},
            },
            {"required": ["tool_calls"], "properties": {"tool_calls": {"minItems": 1}}},
        ]

    model_config = ConfigDict(extra="forbid", json_schema_extra=_schema_extra)

    @model_validator(mode="after")
    def check_has_content_or_thinking_or_tool_calls(self) -> "SFTChatMessage":
        """Validate that message has at least one of content, thinking, or tool_calls.

        Also enforces that content and thinking are mutually exclusive - they cannot
        both be present in the same message.

        Raises:
            ValueError: If the message carries no payload, or carries both
                content and thinking.
        """
        # An empty tool_calls list carries no payload; treating it like None keeps
        # this validator aligned with the `minItems: 1` rule in _schema_extra.
        if self.content is None and self.thinking is None and not self.tool_calls:
            raise ValueError("Message must have at least one of: content, thinking, or tool_calls")

        if self.content is not None and self.thinking is not None:
            raise ValueError("Message cannot have both content and thinking - they are mutually exclusive")

        return self
def get_sft_dataset_discriminator(v: Any) -> str:
    """Determine the SFT dataset schema type based on field presence.

    Used as the callable discriminator of ``SFTDatasetSchemaType``; every
    value returned here must equal one of the ``Tag(...)`` strings of that
    union, which are the schema class names. Checks run in this order:

    - "EmbeddingDatasetItemSchema": has 'query', 'pos_doc' and 'neg_doc'
    - "SFTPChatDatasetItemSchema": has 'messages' (chat format)
    - "SFTPromptTemplateDatasetItemSchema": anything else (prompt template format)

    Args:
        v: The data to discriminate (dict or model instance). Fields are
            looked up as keys on dicts and as attributes on other objects.

    Returns:
        Schema class name identifying the format
    """

    def _has(name: str) -> bool:
        if isinstance(v, dict):
            return name in v
        return hasattr(v, name)

    # Embedding format: has query, pos_doc, neg_doc fields
    if _has("query") and _has("pos_doc") and _has("neg_doc"):
        return "EmbeddingDatasetItemSchema"

    # Chat format: has messages array. The tag is spelled "SFTP..." because the
    # union tags this member with SFTPChatDatasetItemSchema.__name__.
    if _has("messages"):
        return "SFTPChatDatasetItemSchema"

    # Prompt template format: prompt/completion or custom fields (also the fallback)
    return "SFTPromptTemplateDatasetItemSchema"
Any use, reproduction, +# disclosure or distribution of this material and related documentation +# without an express license agreement from NVIDIA CORPORATION or +# its affiliates is strictly prohibited. + +import json +import logging +import os +import re +from typing import Any, Callable, Optional + +import jsonschema +from jsonschema import exceptions +from nmp.automodel.entities.values import FinetuningType, TrainingType +from nmp.automodel.tasks.training.datasets.preparation import DatasetFormatError +from nmp.automodel.tasks.training.datasets.schemas import SFTDatasetSchemaType + +logger = logging.getLogger(__name__) + + +def SFT_SCHEMA(prompt_template: str | None = None): + """Generate JSON schema for SFT datasets. + + Uses the SFTDatasetSchemaType union which supports: + - SFTPromptTemplateDatasetItemSchema: Flexible prompt template format + - SFTPChatDatasetItemSchema: Chat format with messages and tools + + Args: + prompt_template: Optional template string with placeholders like "{input} {output}". + If None or empty string, defaults to standard prompt/completion format. + Ignored for chat format detection. + + Returns: + JSON schema dict with required fields based on the format. + """ + from pydantic import TypeAdapter + + # Determine required fields for prompt template format + if prompt_template is not None and prompt_template != "": + # Extract placeholders from template + found_keys = re.findall(r"{(.*?)}", prompt_template) + + # TODO: Are we constrained by len == 2? + # Check for duplicates + if len(found_keys) != len(set(found_keys)): + duplicates = [key for key in found_keys if found_keys.count(key) > 1] + unique_duplicates = list(dict.fromkeys(duplicates)) + raise ValueError( + f"Prompt template contains duplicate placeholders: {unique_duplicates}. " + f"Each placeholder should appear only once." 
class DatasetValidator:
    """Validator for training datasets.

    This class encapsulates dataset validation logic and avoids parameter drilling
    by storing configuration as instance attributes.

    Example usage after prepare_dataset():
    ```python
    from nmp.automodel.tasks.training.datasets.preparation import prepare_dataset
    from nmp.automodel.tasks.training.datasets.validation import DatasetValidator

    # Prepare datasets
    prepared = prepare_dataset(
        dataset_path=Path(customizer_config.dataset.path),
        output_dir=workspace_dir / "dataset",
    )

    # Validate the prepared datasets
    validator = DatasetValidator(
        training_type=customizer_config.training.training_type,
        finetuning_type=customizer_config.training.finetuning_type,
        prompt_template=customizer_config.dataset.prompt_template,
    )
    validator.validate_dataset(str(prepared.train_file))
    validator.validate_dataset(str(prepared.validation_file))
    ```
    """

    def __init__(
        self,
        training_type: TrainingType,
        finetuning_type: Optional[FinetuningType] = None,
        prompt_template: str | None = None,
    ):
        """Initialize validator with training configuration.

        Args:
            training_type: The type of training (SFT, distillation, etc.)
            finetuning_type: Optional finetuning type (LoRA, all_weights, etc.)
            prompt_template: Optional prompt template for SFT datasets
        """
        self.training_type = training_type
        self.finetuning_type = finetuning_type
        self.prompt_template = prompt_template

    def _validate_json_object(self, obj: dict, schema: dict[str, Any]) -> None:
        """Validate a JSON object against a schema.

        Args:
            obj: The JSON object to validate
            schema: The JSON schema to validate against

        Raises:
            TypeError: If validation fails. The original jsonschema error is
                kept as ``__cause__``.
        """
        try:
            jsonschema.validate(instance=obj, schema=schema)
        except exceptions.ValidationError as e:
            logger.debug(f"Dataset Schema Validation failed: {str(e)}")
            raise TypeError(f"Dataset Schema Validation failed: {e.message}") from e
        except Exception as e:
            logger.debug(f"Dataset Schema Validation failed: {str(e)}")
            raise TypeError(f"Dataset Schema Validation failed: {e}") from e

    def detect_dataset_schema(self, file_path: str) -> str:
        """Detect the dataset schema from the first record of the file.

        Leading blank lines are skipped, matching how validate_dataset() reads
        the file. The record is tried against each entry of SCHEMAS in order and
        the first schema name that accepts it is returned.

        Args:
            file_path: Path to the dataset file

        Returns:
            The key of the first matching entry in SCHEMAS

        Raises:
            DatasetFormatError: If file format is invalid or doesn't match any schema
        """
        with open(file_path, "r", encoding="utf-8") as f:
            # First non-blank line; "" when the file has no records, which then
            # fails JSON parsing below and is reported as a format error.
            line = next((raw for raw in f if raw.strip()), "")

        try:
            obj: dict[str, Any] = json.loads(line)
        except Exception as e:
            logger.debug(f"{file_path} has entry which is not valid json. Error: {e}\n{line}")
            raise DatasetFormatError(f"{file_path} has entry which is not a valid json: {e}") from e

        for schema_name, schema_factory in SCHEMAS.items():
            try:
                validation_schema = schema_factory(self.prompt_template)
                self._validate_json_object(obj, validation_schema)
            except Exception as e:
                # A mismatch (or a schema that cannot be built) just means
                # "not this one"; keep trying the remaining schemas.
                logger.debug(f"Parsed jsonl line does not conform to schema {schema_name}. Error: {e}")
            else:
                logger.debug(f"Parsed jsonl line conforms to schema {schema_name}.")
                return schema_name

        raise DatasetFormatError("Dataset does not match any supported format")

    def validate_dataset(self, file_path: str, dataset_type: Optional[str] = None) -> None:
        """Validate a single dataset file.

        Every non-blank line must be valid JSON and conform to the schema
        registered in SCHEMAS for the dataset type. Types without a registered
        schema are skipped without reading the file.

        Args:
            file_path: Path to the dataset file
            dataset_type: Optional dataset type to validate against. If None, uses training type from config

        Raises:
            DatasetFormatError: If dataset is empty (zero bytes or only blank
                lines) or validation fails
        """
        # Use provided dataset_type or fall back to training type from config
        if dataset_type is None:
            dataset_type = self.training_type.value

        schema_factory = SCHEMAS.get(dataset_type)
        if not schema_factory:
            # Skip validation for unsupported types
            return

        if os.path.getsize(file_path) == 0:
            raise DatasetFormatError(f"{file_path} is empty")

        # Build the schema once; it is identical for every line of the file.
        validation_schema = schema_factory(self.prompt_template)

        records_seen = 0
        with open(file_path, "r", encoding="utf-8") as jsonl_file:
            for line in jsonl_file:
                line = line.strip()
                if not line:
                    continue
                records_seen += 1

                try:
                    obj: dict[str, Any] = json.loads(line)
                except Exception as e:
                    logger.debug(f"{file_path} has entry which is not valid json. Error: {e}\n{line}")
                    raise DatasetFormatError(f"{file_path} has entry which is not valid json: {e}") from e

                try:
                    self._validate_json_object(obj, validation_schema)
                except Exception as e:
                    logger.debug(
                        f"Parsed jsonl line does not conform to schema {validation_schema}. Error: {e}. Object: {obj}"
                    )
                    raise DatasetFormatError(
                        f"Parsed jsonl line does not conform to schema {validation_schema}. Error: {e}"
                    ) from e

        # A file made only of blank lines has a non-zero size but no records.
        if records_seen == 0:
            raise DatasetFormatError(f"{file_path} is empty")
def validate_dataset(
    file_path: str,
    training_type: TrainingType,
    dataset_type: Optional[str] = None,
    finetuning_type: Optional[FinetuningType] = None,
    prompt_template: str | None = None,
) -> None:
    """Validate one dataset file using a throwaway DatasetValidator.

    Backward-compatible functional wrapper around
    ``DatasetValidator.validate_dataset``.

    Args:
        file_path: Path to the dataset file
        training_type: The type of training (SFT, DPO, etc.)
        dataset_type: Dataset type to validate against; forwarded as-is, so
            None lets the validator fall back to the training type
        finetuning_type: Optional finetuning type (LoRA, all_weights, etc.)
        prompt_template: Optional prompt template for SFT datasets
    """
    DatasetValidator(
        training_type=training_type,
        finetuning_type=finetuning_type,
        prompt_template=prompt_template,
    ).validate_dataset(file_path=file_path, dataset_type=dataset_type)
@dataclass
class DistributedContext:
    """
    Distributed training context with file-based barrier synchronization.

    In multi-node training, all pods run the same entry point. This context
    provides:
    - Role detection (coordinator vs worker) based on RANK
    - File-based barriers for cross-pod synchronization

    File barriers work by:
    - Coordinator creates marker files to signal phase completion
    - Workers poll for marker files before proceeding
    - All ranks can sync via mutual signal-and-wait

    Attributes:
        role: Whether this node is coordinator (rank 0) or worker
        rank: This node's rank in the distributed job
        world_size: Total number of nodes participating
        barrier_dir: Directory for barrier marker files (on shared storage).
            Always required; it is only touched when world_size > 1, since every
            barrier operation is a no-op in single-node mode.
    """

    role: DistributedRole
    rank: int
    world_size: int
    barrier_dir: Path
    _barrier_timeout: float = field(default=600.0, repr=False)
    _poll_interval: float = field(default=0.5, repr=False)

    @classmethod
    def from_env(cls, barrier_dir: Path) -> "DistributedContext":
        """
        Create distributed context from environment variables.

        The caller is responsible for constructing the barrier_dir path,
        including any task-specific namespacing for pause/resume support.

        Args:
            barrier_dir: Directory for barrier files (on shared storage).
                Caller should namespace this by task ID for pause/resume support.

        Environment Variables:
            RANK: This node's rank (default: 0)
            WORLD_SIZE: Total number of nodes (default: 1)

        Returns:
            Configured DistributedContext

        Raises:
            ValueError: If RANK or WORLD_SIZE is set to a non-integer value.
        """
        rank = int(os.environ.get(RANK_ENVVAR, "0"))
        world_size = int(os.environ.get(WORLD_SIZE_ENVVAR, "1"))

        role = DistributedRole.COORDINATOR if rank == 0 else DistributedRole.WORKER

        # Setup barrier directory if distributed
        if world_size > 1:
            # Coordinator cleans up stale barriers from previous task runs
            # (e.g., after pause/resume or retry). This must happen before
            # workers start waiting, so we do it here at initialization.
            if role == DistributedRole.COORDINATOR and barrier_dir.exists():
                logger.info(f"Cleaning up stale barriers from previous run: {barrier_dir}")
                shutil.rmtree(barrier_dir, ignore_errors=True)

            barrier_dir.mkdir(parents=True, exist_ok=True)

        ctx = cls(
            role=role,
            rank=rank,
            world_size=world_size,
            barrier_dir=barrier_dir,
        )

        logger.info(
            f"Distributed context: rank={rank}, world_size={world_size}, "
            f"role={role.value}, barriers={'enabled' if ctx.is_distributed else 'disabled'}"
        )

        return ctx

    @property
    def is_coordinator(self) -> bool:
        """True if this is the coordinator node (rank 0)."""
        return self.role == DistributedRole.COORDINATOR

    @property
    def is_distributed(self) -> bool:
        """True if running in multi-node mode."""
        return self.world_size > 1

    # --- Barrier Implementation ---

    def _marker_path(self, barrier_name: str, rank: int) -> Path:
        """Get path to barrier marker file for a specific rank."""
        return self.barrier_dir / f"{barrier_name}.rank{rank}.ready"

    def _resolve_timeout(self, timeout: float | None) -> float:
        """Return the effective timeout; only None selects the default."""
        # `timeout or default` would silently turn an explicit 0 into the default.
        return self._barrier_timeout if timeout is None else timeout

    def _poll_until(self, ready, timeout: float) -> bool:
        """Poll ``ready()`` until it returns True or ``timeout`` seconds elapse.

        The condition is always checked at least once, so a zero timeout acts
        as a non-blocking probe. Uses the monotonic clock so wall-clock
        adjustments cannot shorten or extend the wait.
        """
        deadline = time.monotonic() + timeout
        while True:
            if ready():
                return True
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return False
            time.sleep(min(self._poll_interval, remaining))

    def signal(self, barrier_name: str) -> None:
        """
        Signal that this rank has reached a synchronization point.

        Creates a marker file indicating this rank is ready.

        Args:
            barrier_name: Name of the barrier (should be unique per sync point)
        """
        if not self.is_distributed:
            return

        marker = self._marker_path(barrier_name, self.rank)
        marker.touch()
        logger.debug(f"Barrier signal: {barrier_name} (rank {self.rank})")

    def wait_for_coordinator(self, barrier_name: str, timeout: float | None = None) -> None:
        """
        Wait for the coordinator (rank 0) to signal.

        Used by workers to wait for coordinator to complete a phase.

        Args:
            barrier_name: Name of the barrier to wait for
            timeout: Override default timeout (seconds); None uses the default

        Raises:
            TimeoutError: If coordinator doesn't signal within timeout
        """
        if not self.is_distributed:
            return

        if self.is_coordinator:
            # Coordinator doesn't wait for itself
            return

        timeout = self._resolve_timeout(timeout)
        marker = self._marker_path(barrier_name, rank=0)

        logger.debug(f"Waiting for coordinator at barrier: {barrier_name}")

        if self._poll_until(marker.exists, timeout):
            logger.debug(f"Coordinator signaled barrier: {barrier_name}")
            return

        raise TimeoutError(f"Timeout waiting for coordinator at barrier '{barrier_name}' after {timeout}s")

    def wait_all(self, barrier_name: str, timeout: float | None = None) -> None:
        """
        Wait for all ranks to reach this barrier.

        All ranks must call signal() before any rank proceeds.

        Args:
            barrier_name: Name of the barrier
            timeout: Override default timeout (seconds); None uses the default

        Raises:
            TimeoutError: If not all ranks signal within timeout
        """
        if not self.is_distributed:
            return

        timeout = self._resolve_timeout(timeout)

        logger.debug(f"Waiting for all ranks at barrier: {barrier_name}")

        def _all_ready() -> bool:
            return all(self._marker_path(barrier_name, r).exists() for r in range(self.world_size))

        if self._poll_until(_all_ready, timeout):
            logger.debug(f"All ranks reached barrier: {barrier_name}")
            return

        # Report which ranks are missing for debugging
        missing = [r for r in range(self.world_size) if not self._marker_path(barrier_name, r).exists()]
        raise TimeoutError(f"Timeout at barrier '{barrier_name}' after {timeout}s. Missing ranks: {missing}")

    def sync_point(self, barrier_name: str, timeout: float | None = None) -> None:
        """
        Synchronization point where all ranks must arrive before any proceed.

        Combines signal() and wait_all() - this rank signals and then waits
        for all other ranks.

        Args:
            barrier_name: Name of the sync point
            timeout: Override default timeout (seconds)
        """
        self.signal(barrier_name)
        self.wait_all(barrier_name, timeout)

    def cleanup_barrier(self, barrier_name: str) -> None:
        """
        Clean up barrier marker files (coordinator only).

        Call after all ranks have passed the barrier. Failures to delete a
        marker are logged and otherwise ignored.

        Args:
            barrier_name: Name of the barrier to clean up
        """
        if not self.is_distributed or not self.is_coordinator:
            return

        for r in range(self.world_size):
            marker = self._marker_path(barrier_name, r)
            try:
                if marker.exists():
                    marker.unlink()
            except OSError as e:
                logger.warning(f"Failed to clean up barrier marker {marker}: {e}")
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import subprocess
+from pathlib import Path
+
+from nmp.common.errors import ExceptionConverter, RulesLoader
+
+from .exceptions import (
+    EXCEPTION_REGISTRY,
+    CustomizerTrainingError,
+    ErrorDetails,
+    InternalError,
+    default_exception_handler,
+)
+
+logger = logging.getLogger(__name__)
+
+# Path to the error rules YAML file (relative to this module)
+_ERROR_RULES_PATH = Path(__file__).parent / "error_rules.yaml"
+
+# Additional modules to search for exception types not in the registry
+# subprocess.TimeoutExpired is used for training timeout detection
+_FALLBACK_MODULES = [subprocess]
+
+# Module-level singleton converter, created lazily by get_error_converter()
+_converter: ExceptionConverter | None = None
+
+
+def _load_converter() -> ExceptionConverter:
+    """Build an ExceptionConverter from the rules in error_rules.yaml.
+
+    The rules file is handed to RulesLoader.from_yaml together with
+    EXCEPTION_REGISTRY, default_exception_handler, InternalError (as the
+    fallback exception) and _FALLBACK_MODULES. Nothing is caught here, so any
+    exception raised by the loader propagates to the caller.
+
+    Returns:
+        The converter produced by RulesLoader.from_yaml.
+    """
+    # Lazy %-style arguments: the message is only rendered if the record is emitted.
+    logger.debug("Loading Customizer error rules from: %s", _ERROR_RULES_PATH)
+
+    converter = RulesLoader.from_yaml(
+        _ERROR_RULES_PATH,
+        exception_registry=EXCEPTION_REGISTRY,
+        default_handler=default_exception_handler,
+        fallback_exception=InternalError,
+        fallback_modules=_FALLBACK_MODULES,
+    )
+
+    logger.info("Loaded %s Customizer error mapping rules", converter.rule_count)
+    return converter
+
+
+def get_error_converter() -> ExceptionConverter:
+    """
+    Get the singleton ExceptionConverter for Customizer training errors.
+
+    The converter is created on the first call and cached in the module-level
+    ``_converter`` variable; later calls return the cached instance. No locking
+    is performed around the first-time initialization.
+
+    Returns:
+        Configured ExceptionConverter ready to convert exceptions.
+
+    Raises:
+        Any exception raised while loading error_rules.yaml (for example a
+        missing or malformed rules file) propagates unchanged; in that case
+        nothing is cached and the next call retries the load.
+    """
+    global _converter
+    if _converter is None:
+        _converter = _load_converter()
+    return _converter
+
+
+def _internal_error_details(exception: Exception) -> ErrorDetails:
+    """Wrap ``exception`` in an InternalError and return that error's details."""
+    fallback = InternalError(
+        message=f"An internal error occurred. ({type(exception).__name__}: {exception})",
+        detail=str(exception),
+    )
+    return fallback.to_error_details()
+
+
+def create_error_details(exception: Exception) -> ErrorDetails:
+    """
+    Create error details for Jobs service reporting.
+
+    Converts the exception to a CustomizerTrainingError and returns its
+    details, suitable for passing to progress_reporter.report_error().
+
+    Resolution order:
+      1. A CustomizerTrainingError is returned as-is (no re-conversion).
+      2. Otherwise the shared converter is asked to raise the converted error;
+         a resulting CustomizerTrainingError supplies the details.
+      3. If the converter raises anything else, or returns without raising,
+         the original exception is wrapped in InternalError.
+
+    Args:
+        exception: The exception to convert.
+
+    Returns:
+        ErrorDetails with 'message', 'type', and 'detail' keys.
+
+    Raises:
+        Whatever get_error_converter() raises if the rules cannot be loaded;
+        that call is deliberately outside the guarded conversion below.
+    """
+    # If already a CustomizerTrainingError, return its details directly
+    if isinstance(exception, CustomizerTrainingError):
+        return exception.to_error_details()
+
+    # Convert using the library - fallback_exception=InternalError handles unmatched
+    converter = get_error_converter()
+    try:
+        converter.raise_converted_or_default(exception)
+    except CustomizerTrainingError as converted:
+        return converted.to_error_details()
+    except Exception as e:  # noqa: BLE001 - intentional last-resort guard to guarantee dict return
+        # Unexpected exception type - report the ORIGINAL exception as an InternalError;
+        # the unexpected one is only logged.
+        logger.warning("Unexpected exception type from converter: %s: %s", type(e).__name__, e)
+        return _internal_error_details(exception)
+
+    # Defensive fallback: if converter unexpectedly does not raise, still return valid details
+    logger.warning(
+        "Converter returned without raising for exception type %s; using InternalError fallback.",
+        type(exception).__name__,
+    )
+    return _internal_error_details(exception)
+
+
+__all__ = [
+    "get_error_converter",
+    "create_error_details",
+]
diff --git a/services/automodel/src/nmp/automodel/tasks/training/errors/error_rules.yaml b/services/automodel/src/nmp/automodel/tasks/training/errors/error_rules.yaml
new file mode 100644
index 00000000..1a16801b
--- /dev/null
+++ b/services/automodel/src/nmp/automodel/tasks/training/errors/error_rules.yaml
@@ -0,0 +1,643 @@
+# This file defines rules for converting low-level training exceptions
+# into user-friendly CustomizerTrainingError subclasses.
+# Rules for all backends are present in the same yaml file.
+#
+# Rules are evaluated in order; first match wins.
+#
+# Rule structure:
+#   - <matcher>: <pattern>         # When to match (pick ONE)
+#     exception: <ExceptionClass>  # Exception class from EXCEPTION_REGISTRY
+#     error_details: <message>     # Optional user-friendly message
+
+
+rules:
+  # ===========================================================================
+  # 1. TRAINING TIMEOUT (subprocess.TimeoutExpired)
+  #    All backends
+  # ===========================================================================
+
+  - type: TimeoutExpired  # subprocess.TimeoutExpired from fallback_modules
+    exception: TrainingTimeoutError
+    error_details: "Training exceeded the maximum allowed time limit. To reduce training time: 1) Reduce max_steps or epochs, 2) Use a smaller dataset, 3) Use a smaller model, 4) Use LoRA/PEFT instead of all_weights fine-tuning (LoRA trains faster), or 5) Increase batch_size to process more samples per step (if GPU memory allows). If you need longer training times, contact your administrator to adjust the job timeout limits."
+
+  # ===========================================================================
+  # 2. 
DATASET FORMAT ERRORS (400) + # =========================================================================== + + # --- Automodel --- + # Unsupported role in chat messages + - regex: "Unsupported role in messages: \\w+" + exception: DatasetFormatError + error_details: "Your dataset contains chat messages with an invalid role. Each message in a conversation must have a 'role' field with one of the following values: 'system' (for system prompts), 'user' (for user inputs), 'assistant' (for model responses), or 'tool' (for tool/function outputs). Please check your dataset and ensure all messages use valid roles." + + # --- NeMo-RL --- + # Text type error + - regex: "^text must be a string or a list of strings, got .+$" + exception: DatasetFormatError + error_details: "The 'text' field in your dataset has an invalid type. For NeMo-RL training (DPO/GRPO), the text field must be either a single string or a list of strings. Please check your dataset format and ensure the text field contains the correct data type." + + # Prompt file not found + - regex: "^Prompt file .+ not found$" + exception: DatasetFormatError + error_details: "The prompt template file specified in your training dataset configuration does not exist. Prompt templates define how your dataset samples are formatted for training. Please verify the prompt file path is correct and the file is accessible at the specified location." + + # --- Automodel --- + # Empty dataset + - regex: "^no sample to consume: \\d+$" + exception: DatasetFormatError + error_details: "Your dataset is empty or contains zero valid samples after filtering. This can happen if: 1) The dataset file is empty, 2) All samples were filtered out due to format issues, or 3) The dataset path is incorrect. Please verify your dataset contains valid training samples." 
+ + # All samples consumed + - regex: "^no samples left to consume: \\d+, \\d+$" + exception: DatasetFormatError + error_details: "All samples in your dataset have been consumed before completing the requested number of training steps. This happens when your dataset is too small for the configured epochs or max_steps. Please either: 1) Add more samples to your dataset, 2) Reduce the number of epochs, or 3) Reduce max_steps." + + # Error loading example + - regex: "Error while loading example \\d+ from dataset .+" + exception: DatasetFormatError + error_details: "Failed to load a specific sample from your dataset. This typically indicates a malformed sample that doesn't match the expected format. Please check your dataset for: 1) Missing required fields, 2) Invalid JSON formatting, 3) Incorrect data types for fields. The error message includes the sample index to help you locate the problematic entry." + + # =========================================================================== + # 3. MODEL NOT FOUND ERRORS (404) + # Megatron Bridge + # =========================================================================== + + # Checkpoint file not found (input model checkpoint for training) + - regex: "^Checkpoint file not found: .+$" + exception: ModelNotFoundError + error_details: "The input model checkpoint file could not be found. Please verify the base model path is correct and accessible. This checkpoint is used as the starting point for training." + + # No checkpoints found for resume (output checkpoint directory empty) + - regex: "There were no checkpoints found in checkpoint_dir.*Cannot resume" + exception: ModelNotFoundError + error_details: "The output checkpoint directory is empty. Cannot resume training because no previous training checkpoints were found. Ensure a prior training run completed successfully and saved checkpoints." 
+ + # Nemotron model missing HF source + - regex: "Nemotron Super models expect HF source code to exist at .+" + exception: ModelNotFoundError + error_details: "The Nemotron Super model checkpoint is missing the required HuggingFace source code directory (nemotron_src/). This directory must be present inside the model checkpoint. Please ensure you are using a complete Nemotron Super model checkpoint that includes the HuggingFace source files." + + # =========================================================================== + # 4. MODEL LOAD ERRORS (500) + # =========================================================================== + + # --- Automodel --- + # Model weights swap failure + - contains: "_apply(): Couldn't swap" + exception: ModelLoadError + error_details: "Failed to load the base model: weights could not be applied to a model layer. The base model checkpoint may be corrupted, incomplete, or incompatible with the selected training configuration." + + # Model patching failure + - exact: "Failed to patch model" + exception: ModelLoadError + error_details: "Failed to apply optimizations to the base model. The base model architecture may not be supported for the selected training configuration. Try using a different model or training method." + + # Method signature mismatch + - starts_with: "Signature mismatch:" + exception: ModelLoadError + error_details: "The base model has an incompatible method signature. This typically indicates a version mismatch between the base model and the training framework. Please verify you are using a supported model version." + + # Missing lm_head.weight + - exact: "lm_head.weight not found in model" + exception: ModelLoadError + error_details: "The base model is missing the language model head (lm_head.weight). The base model checkpoint may be corrupted, incomplete, or not a valid language model. Please verify the base model is a complete, valid language model checkpoint." 
+ + # --- NeMo-RL --- + # vLLM not installed + - contains: "vLLM is not installed" + exception: ModelLoadError + error_details: "vLLM is not installed in the training environment. This is an issue with the training environment setup, please contact the administrator to raise an issue with the NeMo Platform team." + + # Missing generation output keys + - regex: "^Missing required keys for GenerationOutputSpec: .+$" + exception: ModelLoadError + error_details: "The base model's generation output is missing required fields. The base model may not be compatible with the selected training method (e.g., GRPO). Please verify you are using a supported model for this training type." + + # Missing score output keys + - regex: "^Missing required keys for ScoreOutputSpec: .+$" + exception: ModelLoadError + error_details: "The base model's score output is missing required fields. The base model may not be compatible with the selected training method. Please verify you are using a supported model for this training type." + + # Pretrained run config not found (Megatron HF-to-mcore conversion) + - contains: "Pretrained run config not found at" + exception: ModelLoadError + error_details: "The pretrained model configuration file was not found after Megatron checkpoint conversion. This usually means the HuggingFace-to-Megatron conversion on the head node saved to a directory not accessible by this worker node. This is an infrastructure issue - please ensure shared storage is properly mounted across all nodes, or contact your administrator." + + # --- Megatron Bridge --- + # Shape mismatch for parameter + - regex: "^Shape mismatch for parameter .+: target shape .+ vs source shape .+$" + exception: ModelLoadError + error_details: "The base model parameter shape does not match the checkpoint. The base model checkpoint may be from a different model architecture or an incompatible version. Please ensure the base model matches the expected architecture for this training configuration." 
+ + # Shape mismatch for buffer + - regex: "^Shape mismatch for buffer .+: .+ vs .+$" + exception: ModelLoadError + error_details: "The base model buffer shape does not match the checkpoint. The base model checkpoint may be corrupted, incomplete, or from an incompatible model version. Please verify the base model checkpoint is valid and complete." + + # =========================================================================== + # 5. TRAINING CONFIG ERRORS - PARALLELISM (400) + # =========================================================================== + + # --- Automodel --- + # Pipeline parallelism: tied embeddings not supported + - all_keywords: ["not compatible with pipeline parallelism", "tie_word_embeddings"] + exception: TrainingConfigError + error_details: "The base model has tied embeddings (tie_word_embeddings=True) which is not compatible with pipeline parallelism. Try using a different parallelism configuration or a model without tied embeddings." + + # Pipeline parallelism: encoder-decoder models not supported + - all_keywords: ["not compatible with pipeline parallelism", "Encoder-Decoder"] + exception: TrainingConfigError + error_details: "The base model is an encoder-decoder architecture (like T5 or BART) which is not supported with pipeline parallelism. Please use a decoder-only base model, or disable pipeline parallelism in your training configuration." + + # PP batch size / microbatch validation + - contains: "pp_batch_size // pp_microbatch_size must be >= pp_size" + exception: TrainingConfigError + error_details: "Pipeline parallelism requires: batch_size >= pipeline_parallel_size. The current batch_size is too small to fill all pipeline stages. Either increase batch_size or reduce pipeline_parallel_size." 
+ + # Context parallelism: SDPA not supported + - contains: "Model does not support SDPA required for context parallelism" + exception: TrainingConfigError + error_details: "The base model does not support scaled dot-product attention (SDPA) which is required for context parallelism. Please set context_parallel_size=1 to disable context parallelism." + + # --- NeMo-RL --- + # Megatron and DTensor both enabled + - exact: "Configure either Megatron (policy.megatron_cfg.enabled=true) or DTensor (policy.dtensor_cfg.enabled=true), not both." + exception: TrainingConfigError + error_details: "Internal configuration error: both Megatron and DTensor training backends are enabled, but only one can be active at a time. This is an issue with the training environment setup, please contact the administrator." + + # Neither Megatron nor DTensor enabled + - contains: "Please either set policy.megatron_cfg.enabled=true" + exception: TrainingConfigError + error_details: "Internal configuration error: no training backend is enabled. The training environment requires either Megatron or DTensor backend to be active. This is an issue with the training environment setup, please contact the administrator." + + # World size insufficient for parallelism + - regex: "^World size \\(\\d+\\) is insufficient for the parallelism configuration" + exception: TrainingConfigError + error_details: "Not enough GPUs available for the requested parallelism settings. The total number of GPUs must be at least pipeline_parallel_size * context_parallel_size * tensor_parallel_size. Either reduce parallelism settings or request more GPUs." + + # World size not divisible by parallelism + - regex: "^World size \\(\\d+\\) must be divisible by PP \\* CP \\* TP" + exception: TrainingConfigError + error_details: "The total number of GPUs must be evenly divisible by (pipeline_parallel_size * context_parallel_size * tensor_parallel_size). For example, with PP=2, CP=1, TP=2, you need 4, 8, 12, etc. GPUs. 
Please adjust your parallelism settings or cluster size." + + # DTensor world size mismatch + - regex: "^World size\\(\\d+\\) must equal to dp_size\\(\\d+\\) \\* tp_size\\(\\d+\\) \\* cp_size\\(\\d+\\) to use DTensor$" + exception: TrainingConfigError + error_details: "The total number of GPUs (world_size) does not match the product of data_parallel_size * tensor_parallel_size * context_parallel_size for the DTensor backend. Please adjust your parallelism settings so they are consistent with the available GPU count." + + # Dynamic batching with PP > 1 + - contains: "Dynamic batching is only supported for single pipeline parallel stage" + exception: TrainingConfigError + error_details: "Dynamic batching is only supported when pipeline_parallel_size=1. With pipeline parallelism (PP > 1), the model is split across GPU stages which requires fixed batch sizes. Please either set pipeline_parallel_size=1 or disable dynamic batching." + + # Dynamic batching exclusive of sequence packing + - contains: "Dynamic Batching is exclusive of Sequence Packing" + exception: TrainingConfigError + error_details: "Dynamic batching and sequence packing cannot be used together. Please disable one of them: either set dynamic_batching=false or set sequence_packing_enabled=false." + + # Sequence packing not supported for VLM models + - contains: "Sequence packing is not supported for VLM models" + exception: TrainingConfigError + error_details: "Sequence packing is not supported for Vision-Language Models (VLMs). Please set sequence_packing_enabled=false when training VLM models." + + # Context parallel not supported for sequence packing (DTensor) + - exact: "Context parallel is not supported for sequence packing. Refer to https://github.com/NVIDIA/NeMo-RL/blob/main/docs/model-quirks.md#context-parallel-with-fsdp2 for more details." + exception: TrainingConfigError + error_details: "Context parallelism cannot be used with sequence packing in the DTensor backend. 
Please either set context_parallel_size=1 to disable context parallelism, or set sequence_packing_enabled=false to disable sequence packing." + + # Context parallel not supported for Gemma3 + - contains: "Context parallel is not supported for Gemma3ForCausalLM" + exception: TrainingConfigError + error_details: "Context parallelism is not supported for Gemma3 models due to limitations in the PyTorch context parallel implementation. Please set context_parallel_size=1 when training Gemma3 models." + + # Context parallel not supported for VLM models + - contains: "Context parallel is yet not supported for VLM models" + exception: TrainingConfigError + error_details: "Context parallelism is not yet supported for Vision-Language Models (VLMs). Please set context_parallel_size=1 when training VLM models." + + # Context parallelism requires sequence packing (Megatron) + - contains: "Context Parallelism (CP>1) requires sequence packing to be enabled" + exception: TrainingConfigError + error_details: "When using the Megatron backend with context_parallel_size > 1, sequence packing must be enabled. Please either enable sequence packing (sequence_packing_enabled=true) or reduce context_parallel_size to 1." + + # Reward models not supported with Megatron backend + - contains: "Reward models are not yet supported with the Megatron backend" + exception: TrainingConfigError + error_details: "Reward models are not yet supported with the Megatron training backend. This is a current limitation of the framework. Please use the DTensor backend for reward model training, or contact your administrator for alternative configurations." + + # Dynamic sampling max batches reached + - contains: "Dynamic sampling has reached the maximum allowed number of batches" + exception: TrainingConfigError + error_details: "Dynamic sampling exceeded the maximum number of generation batches allowed per training step. 
This means the training data or reward signal is too challenging for the model to produce enough valid samples. Consider: 1) Simplifying your dataset, 2) Adjusting num_prompts_per_step or num_generations_per_prompt, 3) Checking that your reward function is not too strict." + + # Batch size not divisible by DP + - regex: "Configuration error: \\(num_prompts_per_step \\* num_generations_per_prompt\\) = \\d+ must be divisible by data_parallel size \\d+" + exception: TrainingConfigError + error_details: "The effective batch size (num_prompts_per_step * num_generations_per_prompt) must be evenly divisible by the number of data parallel workers. Please adjust num_prompts_per_step or num_generations_per_prompt so their product divides evenly." + + # =========================================================================== + # 6. TRAINING CONFIG ERRORS - DPO/GRPO (400) + # NeMo-RL + # =========================================================================== + + # Dynamic batching with DPO + - contains: "Dynamic batching is currently not supported with DPO" + exception: TrainingConfigError + error_details: "DPO (Direct Preference Optimization) training does not support dynamic batching. This is an internal configuration issue with the training environment, please contact the administrator." + + # Sequence packing with DPO + - contains: "Sequence packing is currently not supported with DPO" + exception: TrainingConfigError + error_details: "DPO (Direct Preference Optimization) training does not support sequence packing. Please set sequence_packing_enabled=false in your training request." + + # GRPO requires generation config + - contains: "A generation config in the PolicyConfig is required for GRPO" + exception: TrainingConfigError + error_details: "GRPO (Group Relative Policy Optimization) requires a generation configuration to produce responses during training. This is an internal configuration issue with the training environment, please contact the administrator." 
+ + # Validation dataset required + - exact: "Validation dataset is required if validation is enabled" + exception: TrainingConfigError + error_details: "Validation is enabled for this training job, but no validation dataset was provided. Please provide a validation dataset in your training request, or disable validation." + + # Non-colocated inference with Megatron + - contains: "Non-colocated inference is not supported for Megatron generation backends" + exception: TrainingConfigError + error_details: "The current training configuration uses Megatron for generation, which does not support the required inference mode. This is an internal configuration issue with the training environment, please contact the administrator." + + # Async GRPO requires vLLM async + - contains: "Async GRPO requires vLLM backend with vllm_cfg.async_engine=True" + exception: TrainingConfigError + error_details: "Async GRPO training requires the vLLM backend with async engine enabled, but the current configuration does not have this set. This is an internal configuration issue with the training environment, please contact the administrator." + + # Async GRPO requires importance sampling + - contains: "Importance sampling correction must be enabled for async GRPO" + exception: TrainingConfigError + error_details: "Async GRPO training requires importance sampling correction to handle off-policy samples and ensure stable training. This is an internal configuration issue with the training environment, please contact the administrator." + + # Async GRPO doesn't support colocated inference + - contains: "Colocated inference is not supported for async GRPO" + exception: TrainingConfigError + error_details: "Async GRPO training does not support colocated inference (running training and generation on the same GPUs). This is an internal configuration issue with the training environment, please contact the administrator." 
+ + # top_k sampling threshold (vLLM V1 engine limitation) + - contains: "top_k sampling with values <" + exception: TrainingConfigError + error_details: "The top_k value is too low for the vLLM V1 engine. The vLLM V1 engine does not return logprobs after top_k filtering, so very low top_k values produce inaccurate logprob computations. Please increase top_k or remove the top_k constraint." + + # top_p sampling threshold (vLLM V1 engine limitation) + - contains: "top_p sampling with values <" + exception: TrainingConfigError + error_details: "The top_p value is too low for the vLLM V1 engine. The vLLM V1 engine does not return logprobs after top_p filtering, so very low top_p values produce inaccurate logprob computations. Please increase top_p or remove the top_p constraint." + + # MoE aux loss not supported + - contains: "MoE aux loss is currently not supported" + exception: TrainingConfigError + error_details: "Mixture-of-Experts (MoE) auxiliary loss is not currently supported due to a known bug in Megatron-LM. Please disable the MoE auxiliary loss in your training configuration." + + # =========================================================================== + # 7. TRAINING CONFIG ERRORS - PEFT/LORA (400) + # Automodel + # =========================================================================== + + # Triton not installed + - contains: "triton is not installed" + exception: TrainingConfigError + error_details: "The Triton library, which is required for optimized LoRA kernel operations, is not installed in the training environment. This is an issue with the training environment setup, please contact the administrator to ensure Triton is properly installed." + + # LoRA dimensions mismatch + - contains: "Incompatible X and LoRA A dimensions" + exception: TrainingConfigError + error_details: "The LoRA adapter dimensions are incompatible with the base model's layer dimensions. 
This can happen if you are trying to apply a pre-trained LoRA adapter that was created for a different model architecture. Please ensure the LoRA configuration (lora_dim/rank) is compatible with the base model you are fine-tuning." + + # =========================================================================== + # 8. TRAINING CONFIG ERRORS - PACKING (400) + # NeMo-RL + # =========================================================================== + + # Sequence too long for packing + - regex: "^Sequence length \\d+ exceeds bin capacity \\d+$" + exception: TrainingConfigError + error_details: "When sequence packing is enabled, one or more sequences in your dataset exceed the maximum sequence length (max_seq_length). Sequence packing combines multiple shorter sequences into a single training sample, but each individual sequence must fit within max_seq_length. Please either increase max_seq_length to accommodate longer sequences, or preprocess your dataset to truncate or remove sequences that are too long." + + # Not enough sequences for packing + - regex: "^Cannot create \\d+ bins with only \\d+ sequences" + exception: TrainingConfigError + error_details: "When sequence packing is enabled, the packing algorithm needs enough sequences to efficiently fill the training batches. Your dataset does not have enough sequences for the current batch configuration. Please either add more samples to your dataset, reduce the batch_size, or disable sequence packing by setting sequence_packing_enabled=false." + + # =========================================================================== + # 9. ENVIRONMENT ERRORS (400) + # NeMo-RL + # =========================================================================== + + # Unable to find compatible environment + - regex: "^Unable to find compatible environment - .+$" + exception: TrainingEnvironmentError + error_details: "The specified GRPO environment name is not recognized. 
GRPO (Group Relative Policy Optimization) requires a valid environment that defines how to evaluate model responses. Please check the environment name in your training request and ensure it matches one of the supported environments for your use case." + + # GRPO environment required + - exact: "hyperparameters.environment is required for GRPO, but it is not set" + exception: TrainingEnvironmentError + error_details: "GRPO (Group Relative Policy Optimization) training requires an environment configuration to evaluate model responses and compute rewards. Please specify the environment in your training request's hyperparameters. The environment determines how the model's generated responses will be scored during reinforcement learning." + + # No environment for task type + - regex: "^No environment found for task type: .+$" + exception: TrainingEnvironmentError + error_details: "No GRPO environment is registered for the specified task type. The environment defines how model responses are evaluated during reinforcement learning. This may indicate an unsupported task type or a misconfiguration. Please verify your task type is supported for GRPO training." + + # =========================================================================== + # 10. CHECKPOINT ERRORS (500) + # =========================================================================== + + # --- Automodel --- + # Checkpoint directory already exists + - regex: "Checkpoint directory .* already exists" + exception: CheckpointError + error_details: "The output checkpoint directory already exists from a previous training run. This typically happens when a previous training job failed or was cancelled but left partial checkpoint files behind. Please use a clean output directory, if you do not have access to remove the existing checkpoint directory, contact your administrator." 
+ + # Global plan validation failure + - exact: "Failed to validate global plan" + exception: CheckpointError + error_details: "Checkpoint validation failed during distributed checkpoint loading. This occurs when the 'global plan' (which coordinates how model weights are distributed across GPUs) cannot be validated. Common causes include: 1) Corrupted checkpoint metadata files, 2) Mismatch between the number of GPUs used when saving vs loading the checkpoint, or 3) Interrupted checkpoint save operation. Please ensure the checkpoint is complete and you are using the same GPU topology as when the checkpoint was saved." + + # Missing key in checkpoint + - starts_with: "Missing key in checkpoint state_dict:" + exception: CheckpointError + error_details: "The checkpoint is missing one or more required model weights. This typically indicates that the checkpoint file is corrupted, incomplete (possibly from an interrupted save), or was created from a different model architecture than the one being loaded. Please verify the checkpoint is complete and matches the expected model architecture." + + # MoE expert weights missing + - contains: "Expert weights missing from checkpoint" + exception: CheckpointError + error_details: "The checkpoint for this Mixture-of-Experts (MoE) model is missing one or more expert weights. MoE models have multiple 'expert' sub-networks, and all expert weights must be present in the checkpoint. This typically indicates the checkpoint is corrupted or was saved incorrectly. Please use a complete, valid MoE checkpoint." + + # --- NeMo-RL --- + # Checkpoint file corrupted (JSONDecodeError) + - type_name: JSONDecodeError + exception: CheckpointError + error_details: "The checkpoint metadata file (training_info.json) is corrupted and cannot be parsed. This file stores training progress information like the current step and loss values. The checkpoint may have been saved incompletely or the file was corrupted during storage." 
+ + # Distributed process group not initialized for checkpoint save + - exact: "Distributed process group is not initialized. Cannot save checkpoint." + exception: CheckpointError + error_details: "Cannot save checkpoint because the distributed process group is not initialized. This typically occurs when the training cluster encountered communication issues before checkpoint saving could complete. This is a transient infrastructure issue - please try running your training job again." + + # Megatron core state not initialized for checkpoint save + - exact: "Megatron core state or model is not initialized. Cannot save checkpoint." + exception: CheckpointError + error_details: "Cannot save checkpoint because the Megatron model state is not initialized. This typically occurs when the model failed to load or initialize correctly before training could produce a checkpoint. Please verify the base model is valid and try again." + + # HF checkpoint already exists + - regex: "^HF checkpoint already exists at .+\\. Delete it to run or set overwrite=True\\.$" + exception: CheckpointError + error_details: "The HuggingFace checkpoint output directory already exists from a previous training run or conversion. This typically happens when a previous training job left partial output behind. Please use a clean output directory, or contact your administrator to remove the existing checkpoint." + + # =========================================================================== + # 11. CUDA/GPU ERRORS (500) + # =========================================================================== + + # --- NeMo-RL --- + # Disk space exhausted - occurs in Ray cluster workers during RL training + # Ray stores session logs in /tmp/ray/session_*/logs/ which can fill up ephemeral node storage + - contains: "No space left on device" + exception: DistributedError + error_details: "Disk space exhausted on the node's ephemeral storage (/tmp). 
During reinforcement learning training (DPO/GRPO), Ray stores session logs and temporary files in /tmp/ray/ which can fill up the node's local disk. This is separate from the PVC used for checkpoints and datasets. This is typically a transient infrastructure issue - please try running your training job again, or contact your administrator to ensure adequate ephemeral storage is configured for the cluster nodes." + + # CUDA out of memory - catch by type name + - type_name: OutOfMemoryError + exception: CudaError + error_details: "GPU out of memory. To reduce memory usage: 1) Lower batch_size, 2) Reduce max_seq_length, 3) Use LoRA/PEFT instead of all_weights fine-tuning, or 4) Use a model with fewer parameters." + + # CUDA OOM - catch by message pattern + - contains: "CUDA out of memory" + exception: CudaError + error_details: "GPU out of memory. To reduce memory usage: 1) Lower batch_size, 2) Reduce max_seq_length, 3) Use LoRA/PEFT instead of all_weights fine-tuning, or 4) Use a model with fewer parameters." + + # OOM keyword + - contains: "out of memory" + exception: CudaError + error_details: "GPU out of memory. To reduce memory usage: 1) Lower batch_size, 2) Reduce max_seq_length, 3) Use LoRA/PEFT instead of all_weights fine-tuning, or 4) Use a model with fewer parameters." + + # General CUDA errors + - and: + - any_keywords: ["CUDA", "cuda"] + - any_keywords: ["error", "Error", "failed", "Failed"] + exception: CudaError + error_details: "A GPU/CUDA error occurred. Please check GPU availability, ensure the GPU is not being used by another process, and try again." + + # =========================================================================== + # 12. 
DISTRIBUTED ERRORS (500) + # =========================================================================== + + # --- Automodel --- + # torch.distributed not available + - exact: "torch.distributed not available" + exception: DistributedError + error_details: "The PyTorch distributed package is not available in the training environment. Distributed training requires PyTorch to be built with distributed support enabled. This is an issue with the training environment setup, please contact the administrator to ensure the correct PyTorch version is installed." + + # torch.distributed not initialized + - exact: "expected torch.distributed to be initialized" + exception: DistributedError + error_details: "PyTorch distributed training was not properly initialized before the training process started. This typically happens when the training script is not launched correctly with the distributed launcher (torchrun). This is an issue with the training environment setup, please contact the administrator." + + # Distributed timeout - check for TimeoutError in cause chain + - cause: + type_name: TimeoutError + recursive: true + exception: DistributedError + error_details: "A distributed training operation timed out while waiting for communication between GPUs or nodes. This can happen when: 1) One or more GPU workers crashed or became unresponsive, 2) Network connectivity issues between nodes, 3) Uneven workload causing some GPUs to wait too long for others. This may be a transient issue - please try running your training job again. If the problem persists, contact your administrator." + + # NCCL errors + - any_keywords: ["NCCL", "nccl"] + exception: DistributedError + error_details: "An NCCL (NVIDIA Collective Communications Library) error occurred during GPU-to-GPU communication. NCCL is used to synchronize data between GPUs during distributed training. 
Common causes include: 1) Network connectivity issues between GPU nodes, 2) GPU hardware problems, 3) Incompatible NCCL versions, or 4) Memory pressure on GPUs. This may be a transient issue - please try running your training job again. If the problem persists, contact your administrator." + + # c10d errors + - contains: "c10d" + exception: DistributedError + error_details: "A PyTorch distributed communication error occurred (c10d is PyTorch's distributed communication backend). This indicates a failure in the inter-process or inter-node communication during distributed training. This may be caused by network issues, process crashes, or resource exhaustion. Please try running your training job again. If the problem persists, contact your administrator." + + # --- NeMo-RL --- + # Not enough GPUs + - and: + - type_name: ResourceInsufficientError + - contains: "Not enough GPUs available" + exception: DistributedError + error_details: "The training cluster does not have enough GPUs available for your requested configuration. Your training job requires more GPUs than are currently available in the cluster. Try reducing the parallelism settings (tensor_parallel_size, pipeline_parallel_size) to require fewer GPUs." + + # Not enough CPUs + - and: + - type_name: ResourceInsufficientError + - contains: "Not enough CPUs available" + exception: DistributedError + error_details: "The training cluster does not have enough CPUs available for your requested configuration. CPUs are needed for data loading and preprocessing alongside GPU training." + + # Maximum retries reached + - and: + - type_name: ResourceInsufficientError + - contains: "Maximum number of retries reached" + exception: DistributedError + error_details: "Failed to allocate cluster resources after multiple retry attempts. This is typically a transient issue - please wait a few minutes and try submitting your training job again. If the problem persists, contact your administrator to check cluster health." 
+ + # Placement group timeout + - contains: "Timed out waiting for placement groups to be ready" + exception: DistributedError + error_details: "Timed out while waiting for Ray placement groups to be allocated. Placement groups are used to co-locate GPU workers on the same nodes for efficient communication. This typically happens when the cluster is under heavy load and cannot allocate the required resources in time. Please try submitting your training job again. If the problem persists, contact your administrator." + + # No valid placement groups + - contains: "No valid placement groups found" + exception: DistributedError + error_details: "No valid Ray placement groups could be found for the training job. This indicates a problem with the distributed training cluster configuration or resource availability. This is an infrastructure issue - please contact your administrator to investigate the cluster setup." + + # Workers per node mismatch + - regex: "^workers_per_node list length \\(\\d+\\) must match" + exception: DistributedError + error_details: "The workers-per-node configuration does not match the number of placement groups allocated. This indicates an internal mismatch in the distributed training setup. This is an infrastructure issue - please contact your administrator." + + # Missing sharding annotations + - exact: "Sharding annotations must be provided to use sharded data distribution" + exception: DistributedError + error_details: "The training configuration requires sharded data distribution but sharding annotations are not provided. Sharding annotations specify how data should be distributed across workers for efficient parallel processing. This is an internal configuration issue - please contact your administrator." + + # =========================================================================== + # 13. 
GENERATION ERRORS (500) + # NeMo-RL + # =========================================================================== + + # Weight update failed during refit + - regex: "^Updating weights for the generation policy failed during refit" + exception: GenerationError + error_details: "Failed to update the vLLM generation model weights from the training policy during the 'refit' step. In GRPO training, the generation model periodically syncs weights from the training model. This failure may be caused by: 1) CUDA IPC (Inter-Process Communication) issues between training and generation workers, 2) NCCL communication errors, or 3) Memory pressure on GPUs. This is typically a transient issue - please try running your training job again." + + # generate_text with async_engine + - contains: "generate_text cannot be used with async_engine=True" + exception: GenerationError + error_details: "A synchronous generation method was called on an async vLLM engine. When async_engine is enabled, you must use async methods (e.g., generate_text_async). This is an internal configuration issue with the training environment, please contact the administrator." + + # Any other sync-only method (e.g. update_weights_via_ipc) with async_engine - generic substring that also matches the generate_text message above + - contains: "cannot be used with async_engine=True" + exception: GenerationError + error_details: "A synchronous method was called on an async vLLM engine. When async_engine is enabled, all vLLM operations must use their async variants. This is an internal configuration issue with the training environment, please contact the administrator." + + # Error in sample rollout + - regex: "^Error in sample \\d+ rollout: .+$" + exception: GenerationError + error_details: "An error occurred while generating a response (rollout) for one of the training samples during GRPO training. Rollouts are the model-generated responses used to compute rewards and policy gradients.
This may be caused by: 1) Invalid input data in the sample, 2) Generation parameters causing issues (e.g., max_tokens too low), or 3) vLLM backend errors. Check your dataset for problematic samples." + + # Async generation not enabled + - contains: "Async generation is not enabled" + exception: GenerationError + error_details: "Async generation was requested but the vLLM engine is not configured with async_engine=True. Async generation allows overlapping training and generation for better throughput. This is an internal configuration issue with the training environment, please contact the administrator." + + # NeMo-Gym/Penguin requires async vLLM + - contains: "you must use vllm generation backend with" + exception: GenerationError + error_details: "The NeMo-Gym (Penguin) environment requires the vLLM generation backend with async_engine enabled. NeMo-Gym provides advanced RL training features that depend on async generation. This is an internal configuration issue with the training environment, please contact the administrator." + + # NeMo-Gym/Penguin requires HTTP server + - contains: "expose the vllm server via" + exception: GenerationError + error_details: "The NeMo-Gym (Penguin) environment requires the vLLM server to be exposed via HTTP (expose_http_server: true). This allows the environment to communicate with the generation model through an HTTP API. This is an internal configuration issue with the training environment, please contact the administrator." + + # NeMo-Gym/Penguin incompatible with reasoning parser + - contains: "Please do not use a reasoning parser in vLLM" + exception: GenerationError + error_details: "The NeMo-Gym (Penguin) environment is incompatible with vLLM's reasoning parser. NeMo-Gym handles all data processing including reasoning traces itself, so having a reasoning parser in vLLM would cause conflicts. This is an internal configuration issue with the training environment, please contact the administrator." 
+ + # No placement groups available for vLLM + - exact: "No placement groups available in the cluster" + exception: GenerationError + error_details: "No Ray placement groups are available for vLLM generation workers. This means the cluster could not allocate the required GPU resources for the generation component of training. This is typically a resource availability issue - please try again or contact your administrator to check cluster capacity." + + # Unable to allocate vLLM worker groups + - contains: "Unable to allocate any worker groups with the available resources" + exception: GenerationError + error_details: "Could not allocate any vLLM worker groups with the available cluster resources. The generation component of DPO/GRPO training requires dedicated GPU resources for vLLM inference workers. Please ensure the cluster has enough GPUs, or reduce the generation parallelism settings." + + # Placement group contains no bundles + - exact: "Placement group contains no bundles" + exception: GenerationError + error_details: "A Ray placement group allocated for vLLM generation workers contains no resource bundles. This indicates an issue with cluster resource allocation. This is an infrastructure issue - please contact your administrator." + + # Failed to retrieve bundle/node mapping from placement group + - contains: "Failed to retrieve bundle/node mapping from placement group" + exception: GenerationError + error_details: "Could not retrieve the bundle-to-node mapping from the Ray placement group for vLLM workers. This indicates an issue with the distributed training cluster setup. This is an infrastructure issue - please contact your administrator." + + # No output received for generation request + - regex: "^No output received for request .+$" + exception: GenerationError + error_details: "The vLLM async generation engine did not produce any output for a generation request. 
This can happen when: 1) The generation request timed out, 2) The vLLM worker encountered an internal error, or 3) GPU memory was exhausted during generation. This is typically a transient issue - please try running your training job again." + + # =========================================================================== + # 14. INTERNAL ERRORS (500) + # =========================================================================== + + # --- Automodel Pipeline Parallelism Errors --- + # Pipeline parallelism: first stage missing inputs + - exact: "You must provide either input_ids or inputs_embeds" + exception: InternalError + error_details: "Pipeline parallelism internal error: the first pipeline stage did not receive input data (input_ids or inputs_embeds). This is an internal configuration issue with how the model is split across pipeline stages, please reach out to the NeMo Platform team." + + # Pipeline parallelism: intermediate stage missing embeddings + - exact: "inputs_embeds must be provided for pipeline stages without embed_tokens" + exception: InternalError + error_details: "Pipeline parallelism internal error: an intermediate pipeline stage did not receive embeddings from the previous stage. In pipeline parallelism, each stage processes a portion of the model layers and passes activations to the next stage. This error indicates the inter-stage communication failed, and is an internal training configuration issue, please reach out to the NeMo Platform team." + + # --- Automodel MoE (Mixture of Experts) Errors (raise ParallelismConfigError, a 400 client error, despite this section's 500 heading) --- + # MoE: only 1D mesh supported (occurs when TP+EP are both > 1) + - exact: "We only support 1D mesh for MoE" + exception: ParallelismConfigError + error_details: "MoE (Mixture of Experts) models do not support combining tensor parallelism with expert parallelism. When using expert_model_parallel_size > 1, you must set tensor_parallel_size=1. Please update your parallelism configuration to disable tensor parallelism for MoE training."
+ + # MoE: DTensor placement error (checkpoint/parallelism mismatch) + - contains: "has unsupported DTensor placement" + exception: ParallelismConfigError + error_details: "MoE (Mixture of Experts) model checkpoint has an incompatible tensor distribution for the current expert parallelism settings. This typically occurs when the base model checkpoint was saved with different expert_model_parallel_size than what you're using for training. Please ensure your expert_model_parallel_size matches how the base model was originally distributed, or use a checkpoint that was saved without expert parallelism (expert_model_parallel_size=1)." + + # --- Automodel Fused Optimization Errors --- + # FusedLinearCrossEntropy configuration + - contains: "FusedLinearCrossEntropy requires the model to output hidden states" + exception: InternalError + error_details: "The fused linear cross-entropy optimization requires the model to output hidden states, but the model is configured to only output logits. FusedLinearCrossEntropy is a memory optimization that combines the final linear projection and loss computation. This is an internal configuration issue, contact the NeMo Platform team." + + # --- NeMo-RL Async GRPO Errors --- + # Stale trajectories in replay buffer + - regex: "^Found \\d+ trajectories older than min_valid_version \\d+$" + exception: InternalError + error_details: "The async GRPO replay buffer contains stale trajectories that are older than the minimum valid version. In async GRPO, trajectories are generated asynchronously and stored in a replay buffer. Stale trajectories can cause training instability because they were generated by an outdated policy. This indicates a synchronization issue between generation and training workers. Please contact the administrator." 
+ + # --- NeMo-RL Tensor Processing Errors --- + # Tensor dimension mismatch + - regex: "^tensors for .+ must have same number of dimensions" + exception: InternalError + error_details: "Tensors being processed have mismatched dimensions during internal batching. This is an internal data processing issue that should not occur with valid datasets. Please contact the NeMo Platform team with your dataset format details." + + # Tensor dtype mismatch + - contains: "expected consistent types but got:" + exception: InternalError + error_details: "Tensors being processed have inconsistent data types (dtypes) during internal batching. This is an internal data processing issue that should not occur with valid datasets. Please contact the NeMo Platform team." + + # Tensors on different devices + - contains: "expected tensors on the same device but got:" + exception: InternalError + error_details: "Tensors are located on different devices during internal processing. This is an internal distributed training issue. Please contact the NeMo Platform team." + + # --- Automodel Configuration Errors --- + # Config instantiation failure (from ConfigNode.instantiate()) + # This prints a detailed error with "Instantiation failed for `func_name`" + - contains: "Instantiation failed for" + exception: InternalError + error_details: "Failed to instantiate a training configuration component. The training system uses a configuration tree where each node can instantiate Python objects (like optimizers, schedulers, or model components). This error means one of these instantiations failed, possibly due to invalid parameters or missing dependencies. Please contact the administrator." + + # Model compilation failure + - contains: "Model compilation failed" + exception: InternalError + error_details: "PyTorch model compilation (torch.compile) failed. Model compilation is an optional optimization that can speed up training by compiling the model graph. 
Training will fall back to eager mode and continue without compilation. If this error persists, it may indicate an incompatibility between the model architecture and PyTorch's compiler. Please contact the administrator if training fails." + + # --- General Training Process Errors --- + # Training subprocess error (generic fallback when no specific error was parsed) + # Matches both parser ("Training failed with exit code: X") and train.py ("Training subprocess returned with error code: X") + - regex: "^Training (failed with exit code|subprocess returned with (?:error )?code):? \\d+.*" + exception: InternalError + error_details: "The training process exited with a non-zero exit code, but no specific error message could be extracted from the training logs. This is a generic failure that can have many causes. Please check the full training logs for more details, and contact the administrator if you cannot determine the cause." + diff --git a/services/automodel/src/nmp/automodel/tasks/training/errors/exceptions.py b/services/automodel/src/nmp/automodel/tasks/training/errors/exceptions.py new file mode 100644 index 00000000..6a985270 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/training/errors/exceptions.py @@ -0,0 +1,431 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Custom exceptions for Customizer training errors. + +These exceptions provide user-friendly error messages for errors that may occur +during training with various backends: +- Automodel +- NeMo-RL +- Megatron Bridge +""" + +from dataclasses import dataclass +from typing import TypedDict + + +def format_exception_string(exc: BaseException) -> str: + """Format an exception as ``TypeName: message`` matching Python's traceback style. 
+ + This is the canonical format used throughout the error-handling pipeline: + - ``ray_bootstrap`` writes it into the driver output buffer so the parser + can extract exceptions that occurred outside the subprocess. + - ``default_exception_handler`` uses it for the ``detail`` field reported + to the Jobs service. + - The parser's ``_EXCEPTION_RE`` regex is designed to match this format + when reading subprocess output. + """ + return f"{type(exc).__name__}: {exc}" + + +class ErrorDetails(TypedDict): + """Error details dict for Jobs service reporting.""" + + message: str + type: str + detail: str | None + + +@dataclass +class CustomizerTrainingError(Exception): + """ + Base exception for Customizer training errors. + + Attributes: + message: User-friendly error message shown to the user. + detail: Technical details about the original error (for debugging). + user_message: Class-level default message used as fallback when the YAML rule + does not specify an `error_details` field. Subclasses override this. + """ + + message: str + detail: str | None = None + + # Default user-facing message - subclasses override this. + # Used as fallback when YAML rule omits `error_details` field. + # See default_exception_handler() for usage. + user_message: str = "An error occurred during training." + + def __post_init__(self): + # Call Exception.__init__ with the message + super().__init__(self.message) + + def __str__(self) -> str: + return self.message + + def to_error_details(self) -> ErrorDetails: + """Convert to error_details dict for Jobs service reporting.""" + return ErrorDetails( + message=self.message, + type=type(self).__name__, + detail=self.detail, + ) + + +# ============================================================================= +# CLIENT ERRORS (400) +# ============================================================================= + + +@dataclass +class DatasetFormatError(CustomizerTrainingError): + """ + Dataset has invalid format or schema. 
+ + Raised when: + - Dataset sample has unsupported role (not system/user/assistant/tool) + - Dataset is empty or has zero valid samples + - Text input is not a string or list of strings + - Required field missing from dataset sample + - Prompt file does not exist + """ + + user_message: str = "Dataset format error. Please check your dataset matches the expected schema." + + +@dataclass +class TrainingConfigError(CustomizerTrainingError): + """ + Invalid training configuration. + + Raised when: + - Model incompatible with pipeline parallelism (tied embeddings, encoder-decoder) + - PP batch/microbatch configuration invalid + - Model doesn't support SDPA for context parallelism + - Triton not installed for optimized LoRA kernels + - LoRA adapter dimensions mismatch + - DPO with dynamic batching or sequence packing + - GRPO missing generation config or validation dataset + - Async GRPO configuration errors + - Batch size not divisible by data parallel size + - World size insufficient for parallelism configuration + """ + + user_message: str = ( + "Training configuration error. Please check your parallelism settings " + "(tensor_parallel_size, pipeline_parallel_size, expert_model_parallel_size), " + "batch settings (batch_size, micro_batch_size), or training type configuration." + ) + + +@dataclass +class TrainingEnvironmentError(CustomizerTrainingError): + """ + Invalid environment configuration for GRPO. + + Raised when: + - GRPO environment name is not recognized + - GRPO environment not configured + - No environment found for task type + """ + + user_message: str = "Environment configuration error. Please check your GRPO environment settings." + + +@dataclass +class ParallelismConfigError(CustomizerTrainingError): + """ + Invalid parallelism configuration for MoE models. 
+ + Raised when: + - MoE model uses tensor parallelism with expert parallelism (only 1D mesh supported) + - DTensor placement incompatible with expert parallelism settings + - Checkpoint parallelism settings don't match training configuration + """ + + user_message: str = ( + "Parallelism configuration error for Mixture-of-Experts (MoE) model. " + "MoE models do not support combining tensor_parallel_size > 1 with expert_model_parallel_size > 1. " + "To fix: either set tensor_parallel_size=1 when using expert parallelism, " + "or set expert_model_parallel_size=1 when using tensor parallelism." + ) + + +# ============================================================================= +# NOT FOUND ERRORS (404) +# ============================================================================= + + +@dataclass +class ModelNotFoundError(CustomizerTrainingError): + """ + Model or checkpoint path doesn't exist. + + Raised when: + - The specified checkpoint path does not exist + - The checkpoint directory is empty when resuming + - Nemotron model missing required HF source code + """ + + user_message: str = ( + "Model or checkpoint not found. The specified model path does not exist or is inaccessible. " + "Please verify the model identifier is correct and the model was successfully downloaded." + ) + + +# ============================================================================= +# SERVER ERRORS (500) +# ============================================================================= + + +@dataclass +class ModelLoadError(CustomizerTrainingError): + """ + Failed to load or initialize model. + + Raised when: + - Model weights could not be applied to a layer (corruption) + - Model optimizations/patches failed + - Method signature mismatch during patching + - Missing lm_head.weight in model + - vLLM library not installed + - Shape mismatch for model parameters or buffers + - Generation output missing required fields + """ + + user_message: str = ( + "Failed to load the model. 
This can happen when: " + "1) The model checkpoint is corrupted or incomplete, " + "2) The model architecture is incompatible with the training configuration, " + "3) There is a version mismatch between the model and the training framework. " + "Please verify the model checkpoint is valid and complete." + ) + + +@dataclass +class CheckpointError(CustomizerTrainingError): + """ + Checkpoint save or load failure. + + Raised when: + - Checkpoint directory already exists + - Failed to validate global plan (distributed checkpoint corruption) + - Missing key in checkpoint state_dict + - Expert weights missing from MoE checkpoint + - Training interrupted during checkpoint save + - Parallelism settings don't match checkpoint + - Model export or upload failed + """ + + user_message: str = ( + "Checkpoint save or load failed. This can happen when: " + "1) The checkpoint is corrupted or was saved incompletely (e.g., training was interrupted), " + "2) Disk space is insufficient for saving checkpoints, " + "3) The base model checkpoint is incompatible with the current training configuration." + ) + + +@dataclass +class CudaError(CustomizerTrainingError): + """ + GPU/CUDA runtime error. + + Raised when: + - GPU out of memory (OOM) + - General CUDA runtime errors + """ + + user_message: str = ( + "GPU memory exhausted. To reduce memory usage: " + "1) Reduce batch_size or micro_batch_size, " + "2) Reduce max_seq_length, " + "3) Use LoRA fine-tuning instead of full fine-tuning, " + "4) Increase tensor_parallel_size to distribute the model across more GPUs." + ) + + +@dataclass +class DistributedError(CustomizerTrainingError): + """ + Distributed training or Ray cluster failure. + + Raised when: + - torch.distributed not available + - torch.distributed not initialized + - Distributed operation timeout + - NCCL communication errors + - Ray cluster resource insufficiency + - Placement group allocation failure + """ + + user_message: str = "Distributed training error. 
Please check cluster resources and try again." + + +@dataclass +class GenerationError(CustomizerTrainingError): + """ + vLLM generation/inference failure. + + Raised when: + - Failed to update vLLM weights from training policy + - Sync method called on async engine + - Error during rollout for a sample + - Async generation called without async engine + - Penguin requires async vLLM + """ + + user_message: str = ( + "Generation error during reinforcement learning training. " + "DPO and GRPO training generate model responses during the training loop to compute rewards. " + "This error indicates the generation step failed, which may be caused by vLLM backend issues " + "or incompatible generation settings." + ) + + +@dataclass +class TrainingTimeoutError(CustomizerTrainingError): + """ + Training exceeded time limit. + + Raised when: + - Training subprocess exceeded configured timeout + """ + + user_message: str = ( + "Training exceeded the maximum allowed time limit. " + "To reduce training time: reduce epochs or max_steps, use a smaller dataset, " + "use a smaller model, or use LoRA fine-tuning instead of full fine-tuning. " + "Contact your administrator if you need longer training time limits." + ) + + +@dataclass +class InternalError(CustomizerTrainingError): + """ + Unexpected internal error. + + Raised when: + - Pipeline stage missing input_ids or inputs_embeds + - Stale trajectories in the async GRPO replay buffer + - Config instantiation or model compilation failure + - FusedLinearCrossEntropy configuration error + - Tensor dimension/dtype/device mismatch + - Logger misconfiguration + - Any unmatched error (fallback) + """ + + user_message: str = ( + "An unexpected internal error occurred during training. " + "This is typically caused by framework-level issues such as tensor misconfigurations, " + "device mesh errors, or internal pipeline failures. " + "Please try running your job again.
If the issue persists, contact your administrator " + "with the job ID and error details for further investigation." + ) + + +@dataclass +class GenericTrainingError(CustomizerTrainingError): + """ + Fallback when error classification is ambiguous. + + Used when multiple error rules match the same exception, + making classification unreliable. + """ + + user_message: str = ( + "Training failed due to an error that could not be precisely categorized. " + "Please review the error details for more information. " + "If the issue persists, try adjusting your training configuration." + ) + + +# ============================================================================= +# EXCEPTION REGISTRY +# ============================================================================= + +# Maps exception class names (strings in YAML) to actual Python classes +EXCEPTION_REGISTRY: dict[str, type[Exception]] = { + # Base + "CustomizerTrainingError": CustomizerTrainingError, + # Client errors (400) + "DatasetFormatError": DatasetFormatError, + "TrainingConfigError": TrainingConfigError, + "TrainingEnvironmentError": TrainingEnvironmentError, + "ParallelismConfigError": ParallelismConfigError, + # Not found (404) + "ModelNotFoundError": ModelNotFoundError, + # Server errors (500) + "ModelLoadError": ModelLoadError, + "CheckpointError": CheckpointError, + "CudaError": CudaError, + "DistributedError": DistributedError, + "GenerationError": GenerationError, + "TrainingTimeoutError": TrainingTimeoutError, + "InternalError": InternalError, + "GenericTrainingError": GenericTrainingError, +} + + +# ============================================================================= +# DEFAULT EXCEPTION HANDLER +# ============================================================================= + + +def default_exception_handler( + exception_class: type[Exception], + original_exception: Exception, + error_details: str | None, +) -> Exception: + """ + Default handler for creating Customizer training 
exceptions. + + This handler is used by RulesLoader when: + 1. A rule matches but doesn't have a custom handler + 2. No rule matches and fallback_exception is set + + Args: + exception_class: The exception class to create (from EXCEPTION_REGISTRY) + original_exception: The original exception that was caught + error_details: User-friendly message from the rule's error_details field, + or None if not specified + + Returns: + A new instance of exception_class with appropriate message and detail + """ + # Get the default user message from the class if no error_details provided + if issubclass(exception_class, CustomizerTrainingError): + user_message = error_details or exception_class.user_message + # For InternalError fallback (no matching rule), include the original error + # in the message so users get actionable information instead of a vague message + if exception_class is InternalError and error_details is None: + user_message = f"{user_message} ({format_exception_string(original_exception)})" + return exception_class( + message=user_message, + detail=format_exception_string(original_exception), + ) + else: + # For non-CustomizerTrainingError classes (shouldn't happen, but be safe) + return exception_class(error_details or str(original_exception)) + + +__all__ = [ + "CheckpointError", + "CudaError", + "CustomizerTrainingError", + "DatasetFormatError", + "DistributedError", + "ErrorDetails", + "EXCEPTION_REGISTRY", + "format_exception_string", + "GenerationError", + "GenericTrainingError", + "InternalError", + "ModelLoadError", + "ModelNotFoundError", + "ParallelismConfigError", + "TrainingConfigError", + "TrainingEnvironmentError", + "TrainingTimeoutError", + "default_exception_handler", +] diff --git a/services/automodel/src/nmp/automodel/tasks/training/errors/parser.py b/services/automodel/src/nmp/automodel/tasks/training/errors/parser.py new file mode 100644 index 00000000..e0a4ee21 --- /dev/null +++ 
b/services/automodel/src/nmp/automodel/tasks/training/errors/parser.py @@ -0,0 +1,255 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Error parser for subprocess output. + +This module provides utilities to parse and extract meaningful error messages from +training subprocess output (stdout/stderr). It should be used by all training backends +(Automodel, NeMo-RL, Megatron Bridge) to capture errors for classification. + +The extracted error messages are then matched against YAML rules by the +error converter to produce user-friendly error messages. +""" + +import re +import subprocess +import sys +from collections import deque +from dataclasses import dataclass + +# Number of recent output lines to keep for error parsing +MAX_OUTPUT_LINES = 500 + +# Patterns that indicate an error line (case-insensitive search) +# These match Python exception types and common error patterns from training libraries +ERROR_INDICATORS = [ + # Python exception type names (appear as "ExceptionType: message") + "runtimeerror", + "valueerror", + "assertionerror", + "importerror", + "attributeerror", + "keyerror", + "typeerror", + "filenotfounderror", + "permissionerror", + "oserror", + "ioerror", + # Generic error patterns + "error:", + "exception:", + "traceback", + # Automodel-specific patterns + "instantiation failed", # From ConfigNode.instantiate() + "model compilation failed", # From compile_utils.py + # NeMo-RL patterns + "ray error", + "actor died", + "worker crashed", + # Megatron Bridge patterns + "nemo error", + "lightning error", + # CUDA/GPU patterns + "cuda out of memory", + "out of memory", + "oom", + "cuda error", + "cublas error", + "cudnn error", + # Distributed training patterns + "nccl", + "gloo", + "distributed", + "mpi error", + # General failure patterns + "failed", + "failure", + "abort", + "killed", + "segmentation fault", + "signal", +] + +# Regex to detect 
# Python exception lines ("SomeError: message") and extract
# the type name (group 1) and message (group 2) as separate captures.
_EXCEPTION_RE = re.compile(
    r"\b(\w*(?:Error|Exception)):\s*(.*)",
    re.IGNORECASE,
)

# Wrapper exceptions from distributed training - skip these to find root cause
WRAPPER_EXCEPTION_PATTERNS = [
    "childfailederror",  # torch.distributed wrapper
    "torch.distributed.elastic",  # torch elastic wrapper
    "multiprocessing.errors",  # multiprocessing wrapper
]

# Upper bound on the number of indicator lines joined into the fallback message.
_MAX_FALLBACK_LINES = 10


@dataclass(frozen=True)
class ParsedError:
    """Error extracted from subprocess output.

    Preserves both the original exception type name (as printed in the
    traceback) and the message, so callers can reconstruct a typed
    exception for the converter's type-based matchers.
    """

    exception_type: str
    message: str

    def to_exception(self) -> Exception:
        """Reconstruct an exception that preserves the original type name.

        Dynamically creates an exception class whose ``__name__`` matches
        the original type (e.g. ``ValueError``, ``ResourceInsufficientError``)
        so that ``type_name`` YAML matchers can match it. The class inherits
        from ``RuntimeError`` so that standard ``except Exception`` handling
        works without needing the real library class to be importable.
        """
        exc_class = type(self.exception_type, (RuntimeError,), {})
        return exc_class(self.message)


def _clean_line(line: str) -> str:
    """Remove common prefixes like [rank0]: from distributed output."""
    line = re.sub(r"^\[rank\d+\]:\s*", "", line.strip())
    return line.strip()


def _is_wrapper_exception(line: str) -> bool:
    """Check if this is a wrapper exception that should be skipped."""
    line_lower = line.lower()
    return any(pattern in line_lower for pattern in WRAPPER_EXCEPTION_PATTERNS)


def _extract_exception(line: str) -> ParsedError | None:
    """
    Extract the exception type and message from a subprocess output line.

    Examples:
        >>> _extract_exception("[rank0]: ValueError: invalid input")
        ParsedError(exception_type='ValueError', message='invalid input')
        >>> _extract_exception("torch.cuda.OutOfMemoryError: CUDA OOM")
        ParsedError(exception_type='OutOfMemoryError', message='CUDA OOM')
        >>> _extract_exception(" File 'train.py', line 42") is None
        True
        >>> _extract_exception("ChildFailedError: worker 0 failed") is None  # wrapper, skipped
        True

    Returns None for non-exception lines and wrapper exceptions. When the
    matched line carries no text after the colon, the type name is reused
    as the message so the message is never empty.
    """
    if _is_wrapper_exception(line):
        return None

    match = _EXCEPTION_RE.search(line)
    if match:
        exc_type = match.group(1).strip()
        message = match.group(2).strip() if match.group(2) else ""
        return ParsedError(
            exception_type=exc_type,
            message=message or exc_type,
        )

    return None


def parse_error_from_output(output_lines: deque, returncode: int) -> ParsedError:
    """
    Parse subprocess output and extract a structured error.

    Searches the captured output for Python exception lines and returns a
    ``ParsedError`` preserving both the exception type name and message.
    Callers use ``result.to_exception()`` to reconstruct a typed exception
    that works with both message-based *and* type-based YAML matchers.

    Strategy:
        1. Scan from the newest line backwards and return the LAST Python
           exception line (e.g., "ValueError: message") that is not a
           wrapper exception, with type name and message split apart.
        2. Otherwise join the most recent distinct lines that contain any
           ``ERROR_INDICATORS`` entry (at most 10, oldest first) into a
           ``RuntimeError`` message.
        3. Otherwise report the exit code followed by the last 10 lines.

    Args:
        output_lines: Rolling buffer of recent output lines.
        returncode: Process exit code.

    Returns:
        ParsedError with exception_type and message. An empty buffer yields
        a ``RuntimeError`` entry that only mentions the exit code.
    """
    if not output_lines:
        return ParsedError("RuntimeError", f"Training failed with exit code: {returncode}")

    lines = list(output_lines)

    # Newest-first scan: the first hit is by construction the last exception
    # printed, so there is nothing to accumulate or deduplicate.
    for line in reversed(lines):
        parsed = _extract_exception(line)
        if parsed is not None:
            return parsed

    # Fallback: search for any error-related lines
    error_lines: list[str] = []
    for line in reversed(lines):
        line_lower = line.lower()
        if not any(indicator in line_lower for indicator in ERROR_INDICATORS):
            continue
        cleaned = _clean_line(line)
        if cleaned and cleaned not in error_lines:
            # Prepending while walking backwards keeps chronological order.
            error_lines.insert(0, cleaned)
            if len(error_lines) >= _MAX_FALLBACK_LINES:
                break

    if error_lines:
        return ParsedError("RuntimeError", "\n".join(error_lines))

    # Last resort: return last N lines of output
    last_lines = [_clean_line(line) for line in lines[-10:]]
    message = f"Training failed with exit code {returncode}. Last output:\n" + "\n".join(last_lines)
    return ParsedError("RuntimeError", message)


def read_subprocess_output(proc: subprocess.Popen, buffer: deque) -> None:
    """
    Read subprocess output, stream to console, and capture in buffer.

    This function is designed to run in a daemon thread alongside a subprocess,
    reading its stdout line-by-line, printing to console in real-time, and
    storing lines in a rolling buffer for later error extraction.

    Args:
        proc: The subprocess.Popen object with stdout=PIPE.
        buffer: A deque with maxlen to store recent output lines.
    """
    if proc.stdout is None:
        return

    try:
        for line in iter(proc.stdout.readline, ""):
            if not line:
                break
            # Stream to console
            sys.stdout.write(line)
            sys.stdout.flush()
            # Capture in rolling buffer
            buffer.append(line.rstrip("\n"))
    except (ValueError, OSError):
        # Process closed or pipe broken
        pass


__all__ = [
    "ERROR_INDICATORS",
    "MAX_OUTPUT_LINES",
    "ParsedError",
    "parse_error_from_output",
    "read_subprocess_output",
]

# diff --git a/services/automodel/src/nmp/automodel/tasks/training/integrations.py b/services/automodel/src/nmp/automodel/tasks/training/integrations.py
# new file mode 100644
# index 00000000..f3610c9d
# --- /dev/null
# +++ b/services/automodel/src/nmp/automodel/tasks/training/integrations.py
# @@ -0,0 +1,168 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.

"""WandB and MLflow config helpers for Automodel training."""

import logging
import os
from pathlib import Path
from typing import Any

from nmp.automodel.app.jobs.context import NMPJobContext
from nmp.automodel.tasks.training.schemas import TrainingStepConfig

logger = logging.getLogger(__name__)


def _resolve_with_fallback(
    primary: str | None,
    fallback: str | None,
    default: str,
    field_label: str | None = None,
) -> str:
    """Pick the first truthy value from *primary* → *fallback* → *default*.

    When *field_label* is given and neither *primary* nor *fallback* is set,
    a warning is logged so operators know a hardcoded default is in use.
    """
    if field_label and not (primary or fallback):
        logger.warning(f"{field_label} is not set; using fallback '{default}'.")
    return primary or fallback or default


def build_mlflow_config(
    customizer_config: TrainingStepConfig,
    job_ctx: NMPJobContext,
    framework: str,
) -> dict[str, Any] | None:
    """Build MLflow config for Automodel training.

    The resulting dict is passed to MLflow logging setup in the recipe config.

    Run naming strategy (same as WandB):
    - run_name uses job_id (stable across pause/resume)
    - task_id is added to tags for granular execution tracking

    Missing tracking URI disables integration with a warning.

    Returns:
        A dict with ``tracking_uri``, ``experiment_name``, ``run_name`` and
        ``tags``, or None when MLflow is not configured or no tracking URI
        can be resolved.
    """
    user_config = customizer_config.integrations.mlflow
    if not user_config:
        return None

    # User-provided tracking URI takes precedence over environment variable
    tracking_uri = user_config.tracking_uri or os.environ.get("MLFLOW_TRACKING_URI")
    if not tracking_uri:
        logger.warning(
            "MLflow integration is configured but no tracking URI is set "
            "(MLFLOW_TRACKING_URI env var and integrations.mlflow.tracking_uri in job POST request are empty); "
            "MLflow integration will be disabled."
        )
        return None

    tags: dict[str, str] = {
        "service": "customizer",
        "framework": framework,
    }
    if job_ctx.workspace:
        tags["workspace"] = job_ctx.workspace
    if job_ctx.job_id:
        tags["job"] = job_ctx.job_id
    if job_ctx.task:
        tags["task"] = job_ctx.task
    if customizer_config.model.name:
        tags["model_name"] = customizer_config.model.name

    # User-provided tags override defaults above
    if user_config.tags:
        tags.update(user_config.tags)
    if user_config.description:
        # MLflow run description is stored in the reserved `mlflow.note.content` tag.
        # See: https://mlflow.org/docs/latest/ml/tracking/#how-to-include-additional-description-texts-about-the-run
        tags["mlflow.note.content"] = user_config.description

    experiment_name = _resolve_with_fallback(
        user_config.experiment_name,
        customizer_config.output_model,
        "default-experiment",
        field_label="MLflow experiment_name",
    )
    run_name = _resolve_with_fallback(
        user_config.run_name,
        job_ctx.job_id,
        "default-run",
        field_label="MLflow run_name",
    )

    mlflow_config: dict[str, Any] = {
        "tracking_uri": tracking_uri,
        "experiment_name": experiment_name,
        "run_name": run_name,
        "tags": tags,
    }

    return mlflow_config


def build_wandb_config(
    customizer_config: TrainingStepConfig,
    job_ctx: NMPJobContext,
    framework: str,
) -> dict[str, Any] | None:
    """Build WandB config for Automodel training.

    The resulting dict is passed to wandb.init() as kwargs by automodel.
    See: https://docs.wandb.ai/ref/python/init

    TODO: Add pause/resume support:
    - 'name' and 'id' use job_id (stable across pause/resume)
    - 'resume="allow"' enables continuing runs after pause/resume

    Returns:
        A dict with ``project``, ``name``, ``dir`` and ``tags`` plus the
        optional ``entity``, ``notes`` and ``settings`` keys, or None when
        WandB is not configured or neither an API key nor a base_url exists.
    """
    user_config = customizer_config.integrations.wandb
    if not user_config:
        return None

    wandb_api_key = os.environ.get("WANDB_API_KEY")
    if not user_config.base_url and not wandb_api_key:
        logger.warning("WandB API key is not set and no base_url is provided, skipping WandB integration")
        return None

    # Note: This is semantically different from job_ctx.workspace.
    # This is the workspace for training artifacts.
    run_dir = Path(customizer_config.workspace_path) / "wandb"

    tags: list[str] = ["service:customizer", f"framework:{framework}"]
    if job_ctx.workspace:
        tags.append(f"workspace:{job_ctx.workspace}")
    if job_ctx.job_id:
        tags.append(f"job:{job_ctx.job_id}")
    if job_ctx.task:
        tags.append(f"task:{job_ctx.task}")
    if customizer_config.model.name:
        tags.append(f"model:{customizer_config.model.name}")
    # User-provided tags are appended after the defaults. Tags are a flat list,
    # so a user entry never replaces a default one; both are kept.
    if user_config.tags:
        tags.extend(user_config.tags)

    wandb_config: dict[str, Any] = {
        "project": _resolve_with_fallback(user_config.project, customizer_config.output_model, "default-project"),
        "name": _resolve_with_fallback(user_config.name, job_ctx.job_id, "default-run"),
        "dir": str(run_dir),
        "tags": tags,
    }
    if user_config.entity:
        wandb_config["entity"] = user_config.entity
    if user_config.notes:
        wandb_config["notes"] = user_config.notes
    if user_config.base_url:
        # For self-hosted W&B servers, base_url is passed via the settings dict
        # (wandb.init accepts settings as Union[Settings, Dict[str, Any], None]).
        logger.info(f"Using self-hosted W&B server: {user_config.base_url}")
        wandb_config["settings"] = {"base_url": user_config.base_url}

    return wandb_config

# diff --git a/services/automodel/src/nmp/automodel/tasks/training/model_utils/constants.py b/services/automodel/src/nmp/automodel/tasks/training/model_utils/constants.py
# new file mode 100644
# index 00000000..c784016b
# --- /dev/null
# +++ b/services/automodel/src/nmp/automodel/tasks/training/model_utils/constants.py
# @@ -0,0 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

ADAPTER_FILES = ["adapter_config.json", "adapter_model.safetensors"]

# diff --git a/services/automodel/src/nmp/automodel/tasks/training/model_utils/file_utils.py b/services/automodel/src/nmp/automodel/tasks/training/model_utils/file_utils.py
# new file mode 100644
# index 00000000..579fb555
# --- /dev/null
# +++ b/services/automodel/src/nmp/automodel/tasks/training/model_utils/file_utils.py
# @@ -0,0 +1,172 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
import logging
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional

from nmp.automodel.tasks.training.model_utils.constants import ADAPTER_FILES


class TargetCheckpointType(str, Enum):
    """Target checkpoint format types for model conversion."""

    NEMO = "NEMO"
    HF = "HF"
    HF_LORA = "HF_LORA"


logger = logging.getLogger(__name__)


def get_flat_files_list(parent_dir: str) -> List[str]:
    """
    Get a flat list of every file found under a directory, recursively.

    Args:
        parent_dir: Directory to walk; it is resolved before walking.

    Returns:
        Resolved paths, as strings, of all regular files below the directory.

    Raises:
        ValueError: If the path does not exist or is not a directory.
    """
    parent_path = Path(parent_dir).resolve()
    if not parent_path.exists():
        raise ValueError(f"Path {parent_dir} does not exist")
    if not parent_path.is_dir():
        raise ValueError(f"Path {parent_dir} is not a directory")

    return [str(path) for path in parent_path.rglob("*") if path.is_file()]


def is_adapter_file_present(files: List[str]) -> bool:
    """
    Check whether any of the given paths is a LoRA adapter file.

    A path counts as an adapter file when its lower-cased text contains one of
    the ``ADAPTER_FILES`` names; empty entries are ignored.
    """
    for file in files:
        if not file:
            continue
        if any(adapter_file in file.lower() for adapter_file in ADAPTER_FILES):
            return True
    return False


def check_directory_structure(path: Path | str, target: Dict[str, Optional[Dict]]) -> bool:
    """
    Check that a directory contains at least the entries described by *target*.

    Args:
        path: Directory to inspect.
        target: Expected entries keyed by name. A dict value means the entry
            must be a directory that itself satisfies that nested mapping;
            ``None`` means the entry must be a regular file. Entries present
            on disk but absent from *target* are ignored.

    Returns:
        True when every expected entry exists with the expected kind, False
        otherwise (including when *path* is not a readable directory).
    """
    if isinstance(path, str):
        path = Path(path)

    if not path.is_dir():
        logger.error(f"Provided path '{path}' is not a directory")
        return False

    try:
        got_files = {f.name for f in path.iterdir()}
    except OSError:
        logger.exception("Cannot read directory '%s'", path)
        return False

    expected_files = set(target.keys())
    missing = expected_files - got_files
    if missing:
        logger.debug(f"Mismatch in '{path}': Missing items -> {missing}")
        return False

    for name, _target in target.items():
        current_path = path / name
        if isinstance(_target, dict):
            # this is a directory
            if not current_path.is_dir():
                return False
            if not check_directory_structure(current_path, _target):
                return False
        elif _target is None:
            if not current_path.is_file():
                logger.debug(f"Mismatch: '{current_path}' is expected to be a file but is a directory.")
                return False
    return True


def is_nemo_model_directory(model_path: Path | str) -> bool:
    """
    Check whether a directory has the NeMo checkpoint layout.

    The layout required here is ``context/nemo_tokenizer/``,
    ``context/model.yaml`` and ``weights/metadata.json``.
    """
    nemo_structure = {
        "context": {"nemo_tokenizer": {}, "model.yaml": None},
        "weights": {"metadata.json": None},
    }
    return check_directory_structure(model_path, nemo_structure)


def is_huggingface_model_directory(model_path: Path | str) -> bool:
    """
    Checks if a directory contains the necessary files to be considered a
    Hugging Face model directory.

    Args:
        model_path: The path to the directory to check.

    Returns:
        True if the directory contains a config.json file, at least one
        tokenizer file (tokenizer.json, tokenizer_config.json, vocab.txt or
        merges.txt) and model weights (safetensors or pytorch bin, single file
        or sharded), False otherwise.
    """
    if isinstance(model_path, str):
        model_path = Path(model_path)

    # 1. Check for the mandatory config.json file
    config_file = model_path / "config.json"
    if not config_file.is_file():
        logger.debug(f"Missing {config_file}")
        return False

    tokenizer_files = [
        model_path / "tokenizer.json",
        model_path / "tokenizer_config.json",
        model_path / "vocab.txt",
        model_path / "merges.txt",
    ]
    if not any(tf.is_file() for tf in tokenizer_files):
        logger.debug(f"Missing any tokenizer file: at least one of [{tokenizer_files}] is required")
        return False

    # 2. Check for the presence of model weight files (either safetensors or pytorch bin)
    safe_tensor_file = model_path / "model.safetensors"
    has_safetensors = safe_tensor_file.is_file() or any(model_path.glob("model-*.safetensors"))
    if has_safetensors:
        return True

    logger.debug(f"Missing model weights files in the form of {safe_tensor_file} or {model_path}/model-*.safetensors")
    pytorch_bin_file = model_path / "pytorch_model.bin"
    has_pytorch_bin = pytorch_bin_file.is_file() or any(model_path.glob("pytorch_model-*.bin"))
    if has_pytorch_bin:
        return True

    logger.debug(f"Missing model weights files in the form of {pytorch_bin_file} or {model_path}/pytorch_model-*.bin")
    return False


def determine_llm_model_type(model_dir: str | Path) -> TargetCheckpointType | None:
    """
    Determines whether a model directory contains a HuggingFace or NVIDIA NeMo model.

    Formats are probed in a fixed order and the first match wins: LoRA adapter
    files anywhere below the directory, then the NeMo layout, then the
    Hugging Face layout.

    Returns:
        The detected ``TargetCheckpointType``, or None when the path is not a
        directory or matches none of the known layouts.
    """
    model_path = Path(model_dir).resolve()

    if not model_path.exists() or not model_path.is_dir():
        logger.error(f"Provided path {model_path} is not a directory")
        return None

    logger.debug(f"Checking model in {model_path} for LoRA adapter format indicators")
    if is_adapter_file_present(get_flat_files_list(str(model_path))):
        logger.info(f"Huggingface LoRA adapter format detected in {model_path}")
        return TargetCheckpointType.HF_LORA

    logger.debug(f"Checking model in {model_path} for NeMo format indicators")
    if is_nemo_model_directory(model_path):
        logger.info(f"NeMo format detected in {model_path}")
        return TargetCheckpointType.NEMO

    logger.debug(f"Checking model in {model_path} for HuggingFace format indicators")
    if is_huggingface_model_directory(model_path):
        logger.info(f"HuggingFace format detected in {model_path}")
        return TargetCheckpointType.HF

    logger.warning(f"model at {model_path} is an unknown checkpoint format")
    logger.warning(f"File List: {get_flat_files_list(str(model_path))}")

    return None

# diff --git a/services/automodel/src/nmp/automodel/tasks/training/progress.py b/services/automodel/src/nmp/automodel/tasks/training/progress.py
# new file mode 100644
# index 00000000..0ede0651
# --- /dev/null
# +++ b/services/automodel/src/nmp/automodel/tasks/training/progress.py
# @@ -0,0 +1,173 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Progress reporting for training tasks.

This module provides progress reporting to the Jobs service using
the NeMo Platform SDK. The `JobsServiceProgressReporter` class
handles high-level phase reporting for the training runner.

For training-specific metrics (loss, validation, checkpoints), see
the `TrainingProgressCallback` in the automodel backend which composes
this reporter and provides training-specific methods.
"""

import logging
import os
from typing import Any

from nmp.automodel.app.constants import SERVICE_NAME
from nmp.automodel.app.jobs.context import NMPJobContext
from nmp.common.sdk_factory import get_task_sdk

logger = logging.getLogger(__name__)


class JobsServiceProgressReporter:
    """Reports high-level progress to the Jobs service.

    This class provides progress reporting for the training runner:
    - configure_progress_tracking(max_steps, num_epochs) - Set bounds for percentage calculation
    - report_running(phase, **details) - Report current phase (auto-calculates percentage_done)
    - report_completed(message) - Report successful completion
    - report_error(message) - Report failure

    For training backends that need to report detailed metrics, the
    `update_task` method is exposed for direct use. See `TrainingProgressCallback`
    in the automodel backend for an example.
    """

    def __init__(self, job_ctx: NMPJobContext):
        """Initialize the progress reporter.

        Reporting is enabled only when the ``RANK`` environment variable is 0
        (or unset) and the job context carries a job id, step and task name.
        """
        self._job_ctx = job_ctx
        self._sdk = get_task_sdk(SERVICE_NAME)
        self._is_main_rank = int(os.environ.get("RANK", "0")) == 0
        self._max_steps = 0
        self._num_epochs = 0

        self._enabled = self._is_main_rank and all(
            [self._job_ctx.job_id, self._job_ctx.step, self._job_ctx.normalized_task]
        )

    def configure_progress_tracking(self, max_steps: int, num_epochs: int) -> None:
        """Configure progress tracking at the start of training.

        Args:
            max_steps: Total number of training steps
            num_epochs: Total number of epochs
        """
        self._max_steps = max_steps
        self._num_epochs = num_epochs

    def _calculate_percentage_done(self, step: int | None) -> int:
        """Calculate percentage done based on current step and max_steps.

        Returns 0 when the step is unknown or no positive ``max_steps`` has
        been configured. The result is clamped to 0..100 so that a step past
        ``max_steps`` (or a negative step) never produces an out-of-range
        percentage.
        """
        if step is None or self._max_steps <= 0:
            return 0
        percentage = int((step / self._max_steps) * 100)
        return max(0, min(100, percentage))

    def update_task(
        self,
        status: str = "active",
        status_details: dict[str, Any] | None = None,
        error_details: dict[str, Any] | None = None,
    ) -> None:
        """Update task status via SDK.

        This is the low-level method exposed for composition by training
        callbacks that need to report detailed metrics. SDK failures are
        logged as warnings and never raised to the caller.

        Args:
            status: Task status ("active", "completed", "error")
            status_details: Details about the current status
            error_details: Error information (for status="error")
        """
        # `_enabled` is only true on rank 0, so this also keeps non-main ranks
        # in distributed training from reporting.
        if not self._enabled:
            return

        try:
            self._sdk.jobs.tasks.create_or_update(
                name=self._job_ctx.normalized_task,
                workspace=self._job_ctx.workspace,
                job=self._job_ctx.job_id,
                step=self._job_ctx.step,
                status=status,
                status_details=status_details or {},
                error_details=error_details or {},
            )
        except Exception as e:
            logger.warning(f"Failed to update task progress: {e}")

    def fetch_current_metrics(self) -> dict[str, list[dict[str, float | int]]]:
        """Fetch accumulated metrics from the server for the current task.

        Used to seed metric accumulators on startup so that metrics
        survive pause/resume cycles. Returns empty lists on failure
        or if no prior metrics exist.
        """
        if not self._enabled:
            return {"train_loss": [], "val_loss": []}

        try:
            task = self._sdk.jobs.tasks.retrieve(
                name=self._job_ctx.normalized_task,
                workspace=self._job_ctx.workspace,
                job=self._job_ctx.job_id,
                step=self._job_ctx.step,
            )
            metrics = (task.status_details or {}).get("metrics", {})
            return {
                "train_loss": metrics.get("train_loss", []),
                "val_loss": metrics.get("val_loss", []),
            }
        except Exception as e:
            logger.info(f"No prior metrics to seed (expected on first run): {e}")
            return {"train_loss": [], "val_loss": []}

    # --- High-level runner methods ---

    def report_running(self, phase: str, **details: Any) -> None:
        """Report that a phase is running.

        If 'step' is provided and training schedule is set (via configure_progress_tracking),
        percentage_done is automatically calculated unless explicitly provided.

        Args:
            phase: The current phase (e.g., "compiling_config", "training")
            **details: Additional context (e.g., step, epoch, loss, backend="automodel")
        """
        # Auto-calculate percentage_done if step is provided and not already set
        if "step" in details and "percentage_done" not in details and self._max_steps > 0:
            details["percentage_done"] = self._calculate_percentage_done(details["step"])

        status_details = {"phase": phase, **details}
        self.update_task(status="active", status_details=status_details)

    def report_completed(self, message: str = "Completed") -> None:
        """Report task completed successfully.

        Args:
            message: Completion message
        """
        self.update_task(status="completed", status_details={"message": message, "phase": "completed"})

    def report_error(self, error: str | dict[str, Any]) -> None:
        """Report task error.

        Args:
            error: Error message (str) or error details dict with 'message', 'type', 'detail' keys.
                The dict format is typically from create_error_details() in the errors module.
        """
        if isinstance(error, str):
            error_details = {"message": error}
        else:
            error_details = error
        self.update_task(status="error", error_details=error_details)

    def close(self) -> None:
        """Clean up SDK resources."""
        self._sdk.close()

# diff --git a/services/automodel/src/nmp/automodel/tasks/training/protocol.py b/services/automodel/src/nmp/automodel/tasks/training/protocol.py
# new file mode 100644
# index 00000000..59045ca5
# --- /dev/null
# +++ b/services/automodel/src/nmp/automodel/tasks/training/protocol.py
# @@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from dataclasses import dataclass
from pathlib import Path
from typing import Any


@dataclass
class LibraryConfig:
    """nemo-automodel recipe config written by the training runner."""

    # Recipe configuration as a plain mapping.
    config_dict: dict[str, Any]
    # Location of the YAML file holding that configuration.
    config_path: Path

# diff --git a/services/automodel/src/nmp/automodel/tasks/training/runner.py b/services/automodel/src/nmp/automodel/tasks/training/runner.py
# new file mode 100644
# index 00000000..2d893dde
# --- /dev/null
# +++ b/services/automodel/src/nmp/automodel/tasks/training/runner.py
# @@ -0,0 +1,190 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Training runner with distributed coordination support.

Orchestrates Automodel training in single-node and multi-node environments,
using file-based barriers for cross-pod synchronization.
"""

import json
import logging
import random
import time
from enum import Enum
from pathlib import Path
from types import TracebackType

import yaml
from nmp.automodel.app.constants import DEFAULT_TRAINING_RESULT_FILE_NAME
from nmp.automodel.app.jobs.context import NMPJobContext

from .backends.backend import AUTOMODEL_CONFIG_FILENAME, AutomodelBackend
from .distributed import DistributedContext
from .errors.converter import create_error_details
from .progress import JobsServiceProgressReporter
from .protocol import LibraryConfig
from .schemas import (
    GPUInfo,
    TrainingMetrics,
    TrainingResult,
    TrainingStepConfig,
)
from .utils import get_gpu_info


# Custom YAML representer to serialize Enum values as their string values
def _enum_representer(dumper: yaml.Dumper, data: Enum) -> yaml.Node:
    """Represent Enum as its value (string) rather than a Python object tag."""
    return dumper.represent_str(str(data.value))


yaml.add_representer(Enum, _enum_representer)
yaml.add_multi_representer(Enum, _enum_representer)

logger = logging.getLogger(__name__)

BARRIER_CONFIG_READY = "config_ready"
BARRIER_TRAINING_COMPLETE = "training_complete"


class TrainingRunner:
    """
    Orchestrates Automodel training across single-node and multi-node environments.

    Usage:
        with TrainingRunner() as runner:
            result = runner.run()
    """

    def __init__(self, backend: AutomodelBackend | None = None) -> None:
        """Build the runner from the job context found in the environment.

        Args:
            backend: Backend to drive; a default ``AutomodelBackend`` is
                created when omitted.
        """
        self._job_ctx = NMPJobContext.from_env()
        self._config = self._load_config(self._job_ctx.config_path)
        self._progress = JobsServiceProgressReporter(self._job_ctx)
        self._dist_ctx = DistributedContext.from_env(self._get_barrier_dir())
        self._backend = backend or AutomodelBackend(self._job_ctx)
        self._workspace_path = Path(self._config.workspace_path)
        self._output_path = Path(self._config.output_path)

    def __enter__(self) -> "TrainingRunner":
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        self.close()

    def close(self) -> None:
        """Release the progress reporter's resources."""
        self._progress.close()

    def run(self) -> TrainingResult:
        """Run config compilation, training and post-processing in order.

        Any exception is converted into a failed ``TrainingResult`` instead of
        propagating; on the coordinator it is also reported to the progress
        reporter. The result is handed to ``_write_result`` in all cases.

        Returns:
            The result of the run, successful or not.
        """
        random.seed(self._config.seed)
        logger.info(f"Global random seed set to {self._config.seed}")

        start_time = time.time()
        gpu_info = get_gpu_info()
        # Placeholder so the `finally` clause always has a result to write.
        result = TrainingResult(success=False, error_message="No result")

        try:
            library_config = self._compile_config_phase()
            metrics = self._training_phase(library_config)
            self._dist_ctx.sync_point(BARRIER_TRAINING_COMPLETE)
            result = self._postprocess_phase(gpu_info, metrics, start_time, library_config)

        except Exception as e:
            logger.exception(f"Training failed: {e}")
            error_details = create_error_details(e)
            result = TrainingResult(
                success=False,
                error_message=error_details.get("message", str(e)),
                gpu_info=gpu_info,
                training_duration_seconds=time.time() - start_time,
            )
            if self._dist_ctx.is_coordinator:
                self._progress.report_error(error_details)
        finally:
            self._write_result(result)

        return result

    def _get_barrier_dir(self) -> Path:
        """Return the per-attempt barrier directory under the job storage path."""
        return self._job_ctx.storage_path / self._job_ctx.attempt_id / "distributed" / "barriers"

    def _load_config(self, config_path: Path) -> TrainingStepConfig:
        """Read the JSON step config and validate it as ``TrainingStepConfig``."""
        with open(config_path) as f:
            return TrainingStepConfig.model_validate(json.load(f))

    def _get_library_config_path(self) -> Path:
        """Return where the compiled library config lives inside the workspace."""
        return self._workspace_path / AUTOMODEL_CONFIG_FILENAME

    def _compile_config_phase(self) -> LibraryConfig:
        """Produce the library config on the coordinator and share it.

        The coordinator compiles the config, writes it as YAML and signals
        ``BARRIER_CONFIG_READY``; every other participant waits for that
        signal and then loads the file the coordinator wrote.
        """
        config_path = self._get_library_config_path()

        if self._dist_ctx.is_coordinator:
            self._progress.report_running("compiling_config")
            config_dict = self._backend.compile_config(self._config, self._workspace_path)
            config_path.parent.mkdir(parents=True, exist_ok=True)
            with open(config_path, "w") as f:
                yaml.dump(config_dict, f, default_flow_style=False)
            logger.info(f"Library config written to: {config_path}")
            self._dist_ctx.signal(BARRIER_CONFIG_READY)
            return LibraryConfig(config_dict=config_dict, config_path=config_path)

        self._dist_ctx.wait_for_coordinator(BARRIER_CONFIG_READY)
        return self._load_library_config(config_path)

    def _load_library_config(self, config_path: Path) -> LibraryConfig:
        """Load a previously written library config from YAML.

        Raises:
            FileNotFoundError: If the config file does not exist.
        """
        if not config_path.exists():
            raise FileNotFoundError(
                f"Library config not found at {config_path}. Coordinator may not have written it yet."
            )
        with open(config_path) as f:
            config_dict = yaml.safe_load(f)
        logger.info(f"Loaded library config from: {config_path}")
        return LibraryConfig(config_dict=config_dict, config_path=config_path)

    def _training_phase(self, library_config: LibraryConfig) -> TrainingMetrics:
        """Delegate the training run to the backend and return its metrics."""
        return self._backend.execute_training(self._config, library_config, self._progress)

    def _postprocess_phase(
        self,
        gpu_info: GPUInfo | None,
        metrics: TrainingMetrics,
        start_time: float,
        library_config: LibraryConfig,
    ) -> TrainingResult:
        """Build the successful result; checkpoint handling is coordinator-only.

        Non-coordinators return a bare success result. The coordinator asks
        the backend for the best checkpoint, has it processed into the output
        path, and reports completion.
        """
        if not self._dist_ctx.is_coordinator:
            return TrainingResult(
                success=True,
                gpu_info=gpu_info,
                training_duration_seconds=time.time() - start_time,
            )

        self._progress.report_running("processing_checkpoint")
        checkpoint_path = self._backend.find_best_checkpoint(self._workspace_path, self._config, library_config)
        checkpoint_info = self._backend.process_checkpoint(
            checkpoint_path, self._output_path, self._config, library_config
        )

        result = TrainingResult(
            success=True,
            checkpoint=checkpoint_info,
            gpu_info=gpu_info,
            metrics=metrics,
            training_duration_seconds=time.time() - start_time,
        )
        self._progress.report_completed("Training completed")
        return result

    def _write_result(self, result: TrainingResult) -> None:
        """Write the result as JSON into the workspace (coordinator only)."""
        if not self._dist_ctx.is_coordinator:
            return
        result_path = self._workspace_path / DEFAULT_TRAINING_RESULT_FILE_NAME
        result_path.parent.mkdir(parents=True, exist_ok=True)
        with open(result_path, "w") as f:
            f.write(result.model_dump_json(indent=2))
        logger.info(f"Result written to: {result_path}")

# diff --git a/services/automodel/src/nmp/automodel/tasks/training/schemas.py b/services/automodel/src/nmp/automodel/tasks/training/schemas.py
# new file mode 100644
# index 00000000..4a5d493e
# --- /dev/null
# +++ b/services/automodel/src/nmp/automodel/tasks/training/schemas.py
# @@ -0,0 +1,42 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0 + +from nmp.automodel.app.jobs.training.schemas import ( + CheckpointInfo, + DistillationConfig, + EmbeddingConfig, + GPUInfo, + LoRAConfig, + MLflowConfig, + ModelConfig, + OptimizerType, + TrainingMetrics, + TrainingResult, + TrainingStepConfig, + WandBConfig, +) +from nmp.automodel.entities.values import ( + CheckpointFormat, + FinetuningType, + Precision, + TrainingType, +) + +__all__ = [ + "CheckpointFormat", + "FinetuningType", + "Precision", + "TrainingType", + "CheckpointInfo", + "DistillationConfig", + "EmbeddingConfig", + "GPUInfo", + "LoRAConfig", + "MLflowConfig", + "ModelConfig", + "OptimizerType", + "TrainingMetrics", + "TrainingResult", + "TrainingStepConfig", + "WandBConfig", +] diff --git a/services/automodel/src/nmp/automodel/tasks/training/sequence_packing.py b/services/automodel/src/nmp/automodel/tasks/training/sequence_packing.py new file mode 100644 index 00000000..ae1948ee --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/training/sequence_packing.py @@ -0,0 +1,349 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +Sequence packing utilities for Automodel training. + +Sequence packing combines multiple shorter sequences into a single packed sequence +to improve GPU utilization during training. This module provides: + +1. Optimal pack size calculation based on dataset statistics +2. Dataset sequence length estimation via sampling + +The algorithm balances packing efficiency with training stability by: +- Calculating a target packing factor from global batch size and GPU count +- Ensuring pack size is at least the max sequence length in the dataset +- Clamping to the model's maximum sequence length + +Usage with Automodel: + The `packed_sequence_size` calculated here should be passed to Automodel's + config under `packed_sequence.packed_sequence_size`. 
Automodel automatically + handles step calculation based on the packed dataset size - no manual + adjustment of max_steps or global_batch_size is needed. + +Reference: + - NeMo docs: https://docs.nvidia.com/nemo-framework/user-guide/latest/sft_peft/packed_sequence.html + - Automodel docs: https://github.com/NVIDIA-NeMo/Automodel/blob/main/docs/guides/llm/dataset.md#packed-sequence-support-in-nemo-automodel +""" + +import json +import logging +import math +import random +from dataclasses import dataclass +from pathlib import Path + +from nmp.automodel.app.constants import DEFAULT_SEED +from nmp.automodel.tasks.training.schemas import TrainingStepConfig + +logger = logging.getLogger(__name__) + + +@dataclass +class PackingEstimate: + """Statistics from dataset sampling for sequence packing configuration. + + This dataclass holds the results of sampling a dataset to estimate + sequence length statistics, which are used to calculate optimal + pack sizes for sequence packing. + + Attributes: + pack_size: Recommended pack size in tokens for Automodel's + `packed_sequence.packed_sequence_size` config + avg_seq_length: Average sequence length in the sampled data + max_seq_length: Maximum sequence length in the sampled data + packing_factor: Estimated number of sequences per pack + (pack_size / avg_seq_length) + samples_analyzed: Number of samples successfully tokenized + """ + + pack_size: int + avg_seq_length: int + max_seq_length: int + packing_factor: float + samples_analyzed: int + + +def _ceil_even(num: int | float) -> int: + """Round up to the nearest even number. + + NeMo/Automodel prefer even sequence lengths for efficiency with + tensor parallelism and other optimizations. 
+ + Examples: + >>> _ceil_even(3) + 4 + >>> _ceil_even(4) + 4 + >>> _ceil_even(5.5) + 6 + """ + return int(math.ceil(num / 2) * 2) + + +def calculate_optimal_pack_size( + config: TrainingStepConfig, + dataset_avg_seq_length: int | None = None, + dataset_max_seq_length: int | None = None, +) -> int: + """ + Calculate optimal pack size for sequence packing. + + This algorithm balances packing efficiency with training stability: + 1. Target packing_factor = global_batch_size / total_gpus + 2. target_pack_size = avg_seq_length * packing_factor (but at least max_seq_length) + 3. Clamp to model's max_seq_length + + The packing factor determines how many sequences fit into one packed sequence. + A higher packing factor means better GPU utilization but may affect convergence + if pack sizes become very large. + + If dataset statistics are not provided, uses model's max_seq_length as a + conservative default (which effectively disables the optimization). + + Args: + config: Training configuration containing parallelism, batch, and model settings + dataset_avg_seq_length: Average sequence length in the dataset (after tokenization) + dataset_max_seq_length: Maximum sequence length in the dataset + + Returns: + Optimal pack size in tokens + + Example: + For a setup with: + - global_batch_size = 32 + - 8 GPUs (num_nodes=1, num_gpus_per_node=8) + - avg_seq_length = 512 + - max_seq_length = 1024 + - model.max_seq_length = 4096 + + Calculation: + - packing_factor = 32 / 8 = 4 + - target_pack_size = ceil_even(512 * 4) = 2048 + - final = max(2048, 1024) = 2048 (clamped to 4096) = 2048 + """ + parallelism = config.parallelism + total_gpus = parallelism.num_nodes * parallelism.num_gpus_per_node + gbs = config.batch.global_batch_size + model_max_seq = config.model.max_seq_length + + # If no dataset stats provided, use model's max_seq_length (conservative) + if dataset_avg_seq_length is None or dataset_max_seq_length is None: + logger.info(f"No dataset statistics provided, using model 
max_seq_length: {model_max_seq}") + return model_max_seq + + # Calculate target packing factor (how many sequences can fit in one pack) + # This keeps the effective batch size close to the original gbs + target_packing_factor = max(gbs // total_gpus, 1) + + # Calculate pack size based on average sequence length + # Round to nearest even number for efficiency + target_pack_size = _ceil_even(round(dataset_avg_seq_length * target_packing_factor)) + + # Ensure pack size is at least the max sequence length in the dataset + # (so no sequence gets truncated due to packing) + target_pack_size = max(target_pack_size, dataset_max_seq_length) + + # Clamp to model's maximum sequence length + optimal_pack_size = min(target_pack_size, model_max_seq) + + logger.info( + f"Calculated optimal pack size: {optimal_pack_size} " + f"(avg_seq={dataset_avg_seq_length}, max_seq={dataset_max_seq_length}, " + f"packing_factor={target_packing_factor})" + ) + + return optimal_pack_size + + +def estimate_dataset_sequence_lengths( + config: TrainingStepConfig, + train_file: Path | None = None, + max_samples: int = 1000, + seed: int = DEFAULT_SEED, + trust_remote_code: bool = False, +) -> PackingEstimate | None: + """ + Estimate dataset sequence lengths by sampling and calculate optimal pack size. + + This is a lightweight alternative to full tokenization that uses reservoir + sampling to randomly select a subset of the dataset for sequence length + estimation. The sampling is unbiased regardless of dataset ordering. + + The function: + 1. Loads the model's tokenizer + 2. Randomly samples up to `max_samples` examples using reservoir sampling + 3. Tokenizes each example (using apply_chat_template for chat format) + 4. Calculates optimal pack size based on the statistics + + NOTE: Sampling may underestimate max_seq_length for datasets with rare + long sequences. The pack size calculation accounts for this by clamping + to the model's max_seq_length. 
+ + Args: + config: Training configuration with dataset and model paths + train_file: Path to the prepared training JSONL file. When provided + this file is used directly; otherwise falls back to + ``config.dataset.path / "train.jsonl"``. + max_samples: Maximum number of samples to analyze (default: 1000) + seed: Random seed for reproducible sampling (default: 1111) + trust_remote_code: Whether to trust remote code (default: False) + + Returns: + PackingEstimate with pack_size and statistics, or None if estimation fails + """ + + try: + if train_file is None: + train_file = Path(config.dataset.path) / "train.jsonl" + + if not train_file.exists(): + logger.warning(f"Training file not found: {train_file}") + return None + + # Import here to avoid ModuleNotFoundError in environments where + # transformers is not installed (e.g., during test collection) + from transformers import AutoTokenizer + + # Load tokenizer from model + tokenizer = AutoTokenizer.from_pretrained( + config.model.path, + trust_remote_code=trust_remote_code, + ) + + random.seed(seed) + + # Sample examples to estimate lengths + lengths = _sample_sequence_lengths(train_file, tokenizer, max_samples) + + if not lengths: + logger.warning("Could not estimate sequence lengths from dataset") + return None + + avg_length = _ceil_even(int(sum(lengths) / len(lengths))) + max_length = _ceil_even(max(lengths)) + + # Calculate optimal pack size + pack_size = calculate_optimal_pack_size(config, avg_length, max_length) + packing_factor = pack_size / avg_length if avg_length > 0 else 1.0 + + estimate = PackingEstimate( + pack_size=pack_size, + avg_seq_length=avg_length, + max_seq_length=max_length, + packing_factor=round(packing_factor, 2), + samples_analyzed=len(lengths), + ) + + logger.info( + f"Packing estimate from {len(lengths)} samples: " + f"pack_size={pack_size}, avg_seq={avg_length}, max_seq={max_length}, " + f"packing_factor={estimate.packing_factor:.2f}" + ) + + return estimate + + except Exception as e: 
+ logger.warning(f"Failed to estimate sequence lengths: {e}") + return None + + +def _sample_sequence_lengths( + train_file: Path, + tokenizer, + max_samples: int, +) -> list[int]: + """ + Sample sequences from a JSONL file and return their tokenized lengths. + + Uses reservoir sampling for unbiased random selection, then tokenizes + each sample to measure its length. For chat format, uses apply_chat_template + to get accurate lengths including role tokens and formatting. + + Args: + train_file: Path to training JSONL file + tokenizer: HuggingFace tokenizer + max_samples: Maximum samples to return + + Returns: + List of sequence lengths (in tokens) + """ + # Reservoir sampling to select samples + samples: list[str] = [] + with open(train_file, "r") as f: + for i, line in enumerate(f): + if i < max_samples: + samples.append(line) + else: + j = random.randint(0, i) + if j < max_samples: + samples[j] = line + + # Tokenize samples to get lengths + lengths = [] + for line in samples: + try: + obj = json.loads(line) + length = _get_sample_token_length(obj, tokenizer) + if length is not None: + lengths.append(length) + except Exception: + # Skip malformed lines + continue + + return lengths + + +def _get_sample_token_length(obj: dict, tokenizer) -> int | None: + """ + Get the tokenized length of a dataset sample. + + For chat format, uses apply_chat_template to accurately measure length + including role tokens, special tokens, and formatting. Falls back to + simple text concatenation for other formats or if chat template fails. 
+ + Args: + obj: Parsed JSON object from dataset + tokenizer: HuggingFace tokenizer + + Returns: + Token count, or None if sample is empty/invalid + """ + # Chat format: use apply_chat_template for accurate length + if "messages" in obj: + messages = obj["messages"] + if messages and hasattr(tokenizer, "apply_chat_template"): + try: + tokens = tokenizer.apply_chat_template( + messages, + add_generation_prompt=False, + tokenize=True, + ) + return len(tokens) + except Exception: + # Fall back to text extraction if chat template fails + pass + + # Fallback: concatenate role + content + parts = [] + for m in messages: + if isinstance(m, dict): + role = m.get("role", "") + content = m.get("content", "") + if role or content: + parts.append(f"{role}: {content}") + text = "\n".join(parts) + if text: + return len(tokenizer.encode(text, add_special_tokens=True)) + return None + + # SFT format: prompt + completion + if "prompt" in obj and "completion" in obj: + text = str(obj["prompt"]) + " " + str(obj["completion"]) + return len(tokenizer.encode(text, add_special_tokens=True)) + + # Generic: concatenate all string values + text = " ".join(str(v) for v in obj.values() if isinstance(v, str)) + if text: + return len(tokenizer.encode(text, add_special_tokens=True)) + return None diff --git a/services/automodel/src/nmp/automodel/tasks/training/templates/llama-3.1-instruct.jinja b/services/automodel/src/nmp/automodel/tasks/training/templates/llama-3.1-instruct.jinja new file mode 100644 index 00000000..e074cba5 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/training/templates/llama-3.1-instruct.jinja @@ -0,0 +1,61 @@ +{{- bos_token }} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- set loop_messages = messages %} +{%- if tools is not none and tool_choice is not none %} + {{- 
'<|start_header_id|>system<|end_header_id|>\n\n' }} + {{- "Environment: ipython\n\n" }} + {{- "Cutting Knowledge Date: December 2023\n" }} + {{- "Today Date: " + date_string + "\n\n" }} + {{- "You are a helpful assistant.\n" }} + {{- '<|eot_id|>' }} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' }} + {{- 'You have access to the following functions to supplement your existing knowledge:\n\n' }} + {%- for t in tools %} + {%- set tname = t.function.name %} + {%- set tdesc = t.function.description %} + {%- set tparams = t.function.parameters | tojson %} + {{- "Use the function '" + tname + "' to '" + tdesc + "':\n" }} + {{- '{"name": "' + tname + '", "description": "' + tdesc + '", "parameters": ' + tparams + '}\n\n' }} + {%- endfor %} + {{- 'Think very carefully before calling functions.\n' }} + {{- 'Only call them if they are relevant to the prompt.\n' }} + {{- 'If you choose to call a function ONLY reply in the following format with no natural language surrounding it:\n\n' }} + {{- '{"example_name": "example_value"}\n\n' }} + {{- 'Reminder:\n' }} + {{- '- Function calls MUST follow the specified format, start with \n' }} + {{- '- Required parameters MUST be specified\n' }} + {{- '- Only call one function at a time\n' }} + {{- '- Put the entire function call reply on one line\n' }} + {{- '- Do not call functions if they are not relevant to the prompt' }} + {{- '<|eot_id|>' }} +{%- endif %} +{%- for message in loop_messages %} + {%- if message['role'] in ['ipython', 'tool'] %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {{- "[stdout]" + message['content'] | trim + "[/stdout]\n<|eot_id|>" }} + {%- elif message['role'] == 'assistant'%} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {%- if message.get('tool_calls') is not none %} + {%- set tool_call = message['tool_calls'][0] %} + {%- generation %} + {{- '<|python_tag|>' + tool_call.function.arguments | tojson + '\n<|eot_id|>' }} + {%- endgeneration %} + {%- else %} + {%- generation 
%} + {{- message['content'] | trim + '<|eot_id|>' }} + {%- endgeneration %} + {%- endif %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} + {{- message['content'] | trim + '<|eot_id|>' }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/services/automodel/src/nmp/automodel/tasks/training/templates/llama-3.2-instruct.jinja b/services/automodel/src/nmp/automodel/tasks/training/templates/llama-3.2-instruct.jinja new file mode 100644 index 00000000..e074cba5 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/training/templates/llama-3.2-instruct.jinja @@ -0,0 +1,61 @@ +{{- bos_token }} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- set loop_messages = messages %} +{%- if tools is not none and tool_choice is not none %} + {{- '<|start_header_id|>system<|end_header_id|>\n\n' }} + {{- "Environment: ipython\n\n" }} + {{- "Cutting Knowledge Date: December 2023\n" }} + {{- "Today Date: " + date_string + "\n\n" }} + {{- "You are a helpful assistant.\n" }} + {{- '<|eot_id|>' }} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' }} + {{- 'You have access to the following functions to supplement your existing knowledge:\n\n' }} + {%- for t in tools %} + {%- set tname = t.function.name %} + {%- set tdesc = t.function.description %} + {%- set tparams = t.function.parameters | tojson %} + {{- "Use the function '" + tname + "' to '" + tdesc + "':\n" }} + {{- '{"name": "' + tname + '", "description": "' + tdesc + '", "parameters": ' + tparams + '}\n\n' }} + {%- endfor %} + {{- 'Think very carefully before calling functions.\n' }} + {{- 'Only call them if they are relevant to the prompt.\n' }} + {{- 'If you choose to call a function ONLY reply in the following 
format with no natural language surrounding it:\n\n' }} + {{- '{"example_name": "example_value"}\n\n' }} + {{- 'Reminder:\n' }} + {{- '- Function calls MUST follow the specified format, start with \n' }} + {{- '- Required parameters MUST be specified\n' }} + {{- '- Only call one function at a time\n' }} + {{- '- Put the entire function call reply on one line\n' }} + {{- '- Do not call functions if they are not relevant to the prompt' }} + {{- '<|eot_id|>' }} +{%- endif %} +{%- for message in loop_messages %} + {%- if message['role'] in ['ipython', 'tool'] %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {{- "[stdout]" + message['content'] | trim + "[/stdout]\n<|eot_id|>" }} + {%- elif message['role'] == 'assistant'%} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {%- if message.get('tool_calls') is not none %} + {%- set tool_call = message['tool_calls'][0] %} + {%- generation %} + {{- '<|python_tag|>' + tool_call.function.arguments | tojson + '\n<|eot_id|>' }} + {%- endgeneration %} + {%- else %} + {%- generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {%- endgeneration %} + {%- endif %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} + {{- message['content'] | trim + '<|eot_id|>' }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/services/automodel/src/nmp/automodel/tasks/training/templates/llama-3.3-instruct.jinja b/services/automodel/src/nmp/automodel/tasks/training/templates/llama-3.3-instruct.jinja new file mode 100644 index 00000000..a0ba6017 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/training/templates/llama-3.3-instruct.jinja @@ -0,0 +1,61 @@ +{{- bos_token }} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif 
%} +{%- endif %} +{%- set loop_messages = messages %} +{%- if tools is not none and tool_choice is not none %} + {{- '<|start_header_id|>system<|end_header_id|>\n\n' }} + {{- "Environment: ipython\n\n" }} + {{- "Cutting Knowledge Date: December 2023\n" }} + {{- "Today Date: " + date_string + "\n\n" }} + {{- "You are a helpful assistant.\n" }} + {{- '<|eot_id|>' }} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' }} + {{- 'You have access to the following functions to supplement your existing knowledge:\n\n' }} + {%- for t in tools %} + {%- set tname = t.function.name %} + {%- set tdesc = t.function.description %} + {%- set tparams = t.function.parameters | tojson %} + {{- "Use the function '" + tname + "' to '" + tdesc + "':\n" }} + {{- '{"name": "' + tname + '", "description": "' + tdesc + '", "parameters": ' + tparams + '}\n\n' }} + {%- endfor %} + {{- 'Think very carefully before calling functions.\n' }} + {{- 'Only call them if they are relevant to the prompt.\n' }} + {{- 'If you choose to call a function ONLY reply in the following format with no natural language surrounding it:\n\n' }} + {{- '{"example_name": "example_value"}\n\n' }} + {{- 'Reminder:\n' }} + {{- '- Function calls MUST follow the specified format, start with \n' }} + {{- '- Required parameters MUST be specified\n' }} + {{- '- Only call one function at a time\n' }} + {{- '- Put the entire function call reply on one line\n' }} + {{- '- Do not call functions if they are not relevant to the prompt' }} + {{- '<|eot_id|>' }} +{%- endif %} +{%- for message in loop_messages %} + {%- if message['role'] in ['ipython', 'tool'] %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {{- "[stdout]" + message['content'] | trim + "[/stdout]\n<|eot_id|>" }} + {%- elif message['role'] == 'assistant'%} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {%- if message.get('tool_calls') is not none %} + {%- set tool_call = message['tool_calls'][0] %} + {%- generation %} + {{- '' + 
tool_call.function.arguments | tojson + '\n<|eot_id|>' }} + {%- endgeneration %} + {%- else %} + {%- generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {%- endgeneration %} + {%- endif %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} + {{- message['content'] | trim + '<|eot_id|>' }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/services/automodel/src/nmp/automodel/tasks/training/templates/nemotron-3.1.jinja b/services/automodel/src/nmp/automodel/tasks/training/templates/nemotron-3.1.jinja new file mode 100644 index 00000000..00cfd85e --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/training/templates/nemotron-3.1.jinja @@ -0,0 +1,51 @@ +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content'] | trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = '' %} +{%- endif %} +{%- if tools is not none %} + {{- '<|begin_of_text|><|start_header_id|>system<|end_header_id|>' + '\n\n' + system_message }} + {{- '\n\n' if system_message else '' }} + {{- '[' }} + {%- for t in tools %} + {{- (t.function if t.function is defined else t) | tojson() }} + {{- ', ' if not loop.last else '' }} + {%- endfor %} + {{- ']' }} + {{- '<|eot_id|>' }} +{%- else %} + {{- '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n' + system_message + '<|eot_id|>' }} +{%- endif %} +{%- for message in messages %} + {%- if (message['role'] in ['user', 'tool']) != (loop.index0 % 2 == 0) %} + {{- raise_exception('Conversation roles must alternate between user/tool and assistant') }} + {%- elif message['role'] == 'user' %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }} + {%- elif message['role'] == 'tool' %} + {%- set tool_response = '[' + message['content'] | trim + ']' %} + {{- 
'<|start_header_id|>user<|end_header_id|>\n\n' + tool_response + '<|eot_id|>' }} + {%- elif message['role'] == 'assistant' and message.get('tool_calls') is not none %} + {%- set tool_calls = message['tool_calls'] %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {%- generation %} + {{- '['}} + {%- for tool_call in tool_calls %} + {{- '{"name": "' + tool_call.function.name + '", "arguments": ' + tool_call.function.arguments | tojson + '}' }} + {%- if not loop.last %} + {{- ', ' }} + {%- else %} + {{- ']<|eot_id|>' }} + {%- endif %} + {%- endfor %} + {%- endgeneration %} + {%- elif message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {%- generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {%- endgeneration %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/services/automodel/src/nmp/automodel/tasks/training/templates/nemotron-3.3.jinja b/services/automodel/src/nmp/automodel/tasks/training/templates/nemotron-3.3.jinja new file mode 100644 index 00000000..7530a8c8 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/training/templates/nemotron-3.3.jinja @@ -0,0 +1,21 @@ +{{- bos_token }} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = '' %} +{%- endif %} +{{- '<|start_header_id|>system<|end_header_id|>\n\n' }} +{{- system_message }} +{{- '<|eot_id|>' }} +{%- for message in messages %} + {%- if message['role'] == 'assistant' and '' in message['content'] %} + {%- set content = message['content'].split('')[-1].lstrip() %} + {%- else %} + {%- set content = message['content'] %} + {%- endif %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + content | trim + '<|eot_id|>' }} +{%- endfor %} +{%- if add_generation_prompt %} + {{- 
'<|start_header_id|>assistant<|end_header_id|>\\n\\n' }} +{%- endif %} diff --git a/services/automodel/src/nmp/automodel/tasks/training/templates/nemotron-super-3.3.jinja b/services/automodel/src/nmp/automodel/tasks/training/templates/nemotron-super-3.3.jinja new file mode 100644 index 00000000..1deec263 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/training/templates/nemotron-super-3.3.jinja @@ -0,0 +1,82 @@ +{{- bos_token }} +{%- set ns = namespace(p='', has_tools=False) %} +{%- if tools is not none and tool_choice is not none %} + {%- set ns.has_tools = True %} + {%- set ns.p = ns.p + 'You are an expert in composing functions. You are given a question and a set of possible functions. ' %} + {%- set ns.p = ns.p + 'Based on the question, you will need to make one or more function/tool calls to achieve the purpose. ' %} + {%- set ns.p = ns.p + 'If none of the function can be used, point it out. ' %} + {%- set ns.p = ns.p + 'If the given question lacks the parameters required by the function, also point it out. ' %} + {%- set ns.p = ns.p + 'You should only return the function call in tools call sections. 
' %} + {%- set ns.p = ns.p + 'Here is a list of functions in JSON format that you can invoke.\n' %} + {%- set ns.p = ns.p + '[' %} + {%- for tool in tools %} + {%- set function = tool.function %} + {%- set keys = function.keys() | reject('equalto', 'return') | list %} + {%- set ns.p = ns.p + '{"type": "function", "function": {' %} + {%- for key in keys %} + {%- set val = function[key] %} + {%- if val is string %} + {%- set ns.p = ns.p + '"' + key + '": "' + val + '"' %} + {%- else %} + {%- set ns.p = ns.p + '"' + key + '": ' + val|tojson %} + {%- endif %} + {%- if not loop.last %} + {%- set ns.p = ns.p + ', ' %} + {%- endif %} + {%- endfor %} + {%- set ns.p = ns.p + '}}' %} + {%- if not loop.last %} + {%- set ns.p = ns.p + ', ' %} + {%- endif %} + {%- endfor %} + {%- set ns.p = ns.p + ']\n' %} + {%- set ns.p = ns.p + 'If you decide to invoke any of the function(s), put it in the JSON TOOL CALLING format of ' %} + {%- set ns.p = ns.p + '[{"name": "func_name1", "arguments": {"params_name1": "params_value1", "params_name2": "params_value2"}}, ' %} + {%- set ns.p = ns.p + '{"name": "func_name2", "arguments": {"params_name1": "params_value1", "params_name2": "params_value2"}}] ' %} + {%- set ns.p = ns.p + '\n' %} + {%- set ns.p = ns.p + 'You SHOULD NOT include any other information in the response. 
REMEMBER TO USE JSON TOOL CALLING FORMAT.\n\n' %} +{%- endif %} +{%- for message in messages %} + {%- if message['role'] == 'user' %} + {%- if ns.has_tools %} + {%- if add_generation_prompt and loop.index0 == ((messages | length) - 1) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + ns.p + (message['content'] | trim) + '<|eot_id|>' }} + {%- elif not add_generation_prompt and loop.index0 == ((messages | length) - 2) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + ns.p + (message['content'] | trim) + '<|eot_id|>' }} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + (message['content'] | trim) + '<|eot_id|>' }} + {%- endif %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + (message['content'] | trim) + '<|eot_id|>' }} + {%- endif %} + {%- elif message['role'] in ['ipython', 'tool'] %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' }} + {{- 'Here are the results from the tool:' + (message['content'] | trim) + '<|eot_id|>' }} + {%- elif message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {%- generation %} + {%- if message.get('tool_calls') is not none %} + {{- '[' }} + {%- for tool_call in message['tool_calls'] %} + {{- '{"name": "' + tool_call.function.name + '", "arguments": ' + tool_call.function.arguments | tojson }} + {%- if tool_call.get('id') is not none %} + {{- ', "id": "' + tool_call.id + '"' }} + {%- endif %} + {{- '}' }} + {%- if not loop.last %} + {{- ', ' }} + {%- endif %} + {%- endfor %} + {{- ']' }} + {{- '<|eot_id|>' }} + {%- else %} + {{- message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + (message['content'] | trim) + '<|eot_id|>' }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' 
}} +{%- endif %} diff --git a/services/automodel/src/nmp/automodel/tasks/training/templates/phi-4.jinja b/services/automodel/src/nmp/automodel/tasks/training/templates/phi-4.jinja new file mode 100644 index 00000000..33a466f8 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/training/templates/phi-4.jinja @@ -0,0 +1,15 @@ +{%- for message in messages %} + {%- if (message['role'] == 'system') %} + {{- '<|im_start|>system<|im_sep|>' + message['content'] + '<|im_end|>'}} + {%- elif (message['role'] == 'user') %} + {{-'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}} + {%- elif (message['role'] == 'assistant') %} + {{- '<|im_start|>assistant<|im_sep|>' }} + {%- generation %} + {{- message['content'] + '<|im_end|>'}} + {%- endgeneration %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant<|im_sep|>' }} +{%- endif %} diff --git a/services/automodel/src/nmp/automodel/tasks/training/utils.py b/services/automodel/src/nmp/automodel/tasks/training/utils.py new file mode 100644 index 00000000..afbf73f9 --- /dev/null +++ b/services/automodel/src/nmp/automodel/tasks/training/utils.py @@ -0,0 +1,93 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import logging +import os + +from nmp.common.jobs.constants import NEMO_JOB_ID_ENVVAR + +from .schemas import GPUInfo + +logger = logging.getLogger(__name__) + + +def _get_architecture_name(major: int, minor: int) -> str: + """Map CUDA compute capability to architecture name. 
+ + https://developer.nvidia.com/cuda-gpus + """ + if major == 3: + return "Kepler" + if major == 5: + return "Maxwell" + if major == 6: + return "Pascal" + if major == 7: + # 7.0/7.2 = Volta, 7.5 = Turing + if minor >= 5: + return "Turing" + return "Volta" + if major == 8: + return "Ampere" + if major == 9: + return "Hopper" + if major == 10: + return "Blackwell" + return f"Unknown (sm_{major}{minor})" + + +def get_gpu_info() -> GPUInfo | None: + """Capture GPU architecture information.""" + try: + import torch + + if not torch.cuda.is_available(): + return None + + device_id = torch.cuda.current_device() + props = torch.cuda.get_device_properties(device_id) + major, minor = torch.cuda.get_device_capability(device_id) + + return GPUInfo( + architecture=_get_architecture_name(major, minor), + device_name=props.name, + memory_gb=props.total_memory / (1024**3), + cuda_version=str(torch.version.cuda), + ) + except Exception as e: + logger.warning(f"Failed to capture GPU info: {e}") + return None + + +def generate_torchrun_flags_from_env() -> list[str]: + """Generate torchrun flags for distributed training.""" + # These values are typically injected by the Volcano/PyTorch operator + # or the Core Jobs Service when using DistributedGPUExecutionProvider. 
+ master_addr = os.environ.get("MASTER_ADDR", "localhost") + master_port = os.environ.get("MASTER_PORT", "23456") # Default to port from volcano_job.py + node_rank = os.environ.get("NODE_RANK", os.environ.get("RANK", "0")) + num_nodes = os.environ.get("WORLD_SIZE", "1") + gpus_per_node = os.environ.get("GPUS_PER_NODE") + if gpus_per_node is None: + try: + import torch + + gpus_per_node = str(torch.cuda.device_count()) + except Exception as e: + logger.warning(f"Failed to determine number of GPUs: {e}, using default of 1") + gpus_per_node = "1" + + return [ + "--nnodes", + num_nodes, + "--nproc_per_node", + gpus_per_node, + "--node_rank", + node_rank, + "--rdzv_id", + os.environ.get(NEMO_JOB_ID_ENVVAR, "customizer-rdzv"), + "--rdzv_backend", + "c10d", + "--rdzv_endpoint", + f"{master_addr}:{master_port}", + ] diff --git a/services/automodel/tests/test_adapter.py b/services/automodel/tests/test_adapter.py new file mode 100644 index 00000000..dfd1f44b --- /dev/null +++ b/services/automodel/tests/test_adapter.py @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from nmp.automodel.adapter import automodel_spec_to_compiler_output +from nmp.automodel.api.v2.jobs.schemas import DistillationTraining, SFTTraining + + +def test_adapter_sft() -> None: + spec = automodel_spec_to_compiler_output( + { + "model": "meta/llama", + "dataset": {"training": "default/train"}, + "training": {"training_type": "sft", "finetuning_type": "lora"}, + "output": {"name": "out", "type": "adapter", "fileset": "out-fs"}, + }, + ) + assert isinstance(spec.training, SFTTraining) + assert spec.dataset == "default/train" + + +def test_adapter_distillation() -> None: + spec = automodel_spec_to_compiler_output( + { + "model": "meta/llama", + "dataset": {"training": "default/train"}, + "training": { + "training_type": "distillation", + "finetuning_type": "all_weights", + "teacher_model": "meta/teacher", + }, + "output": {"name": "out", "type": "model", "fileset": "out-fs"}, + }, + ) + assert isinstance(spec.training, DistillationTraining) + assert spec.training.teacher_model == "meta/teacher" diff --git a/services/automodel/tests/test_compiler.py b/services/automodel/tests/test_compiler.py new file mode 100644 index 00000000..6805b611 --- /dev/null +++ b/services/automodel/tests/test_compiler.py @@ -0,0 +1,151 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +from datetime import datetime +from pathlib import Path +from unittest.mock import AsyncMock, Mock + +import pytest +from nemo_platform import AsyncNeMoPlatform +from nemo_platform.types.models.model_entity import ModelEntity +from nmp.automodel.adapter import automodel_spec_to_compiler_output +from nmp.automodel.api.v2.jobs.schemas import CustomizationJobOutput, LoRAParams, OutputResponse, SFTTraining +from nmp.automodel.app.jobs.compiler import _build_file_download_config +from nmp.automodel.compile import platform_job_config_compiler +from nmp.automodel.images import DEFAULT_AUTOMODEL_IMAGE_REGISTRY, TASKS_IMAGE_NAME, TRAINING_IMAGE_NAME +from nmp.common.entities.utils import get_random_id +from nmp.common.jobs.exceptions import PlatformJobCompilationError + + +def _make_mock_model_entity( + workspace: str = "default", + name: str = "test-target", + fileset: str | None = "default/base-model", +) -> ModelEntity: + return ModelEntity( + id=get_random_id("model"), + workspace=workspace, + name=name, + fileset=fileset, + trust_remote_code=False, + finetuning_type=None, + created_at=datetime.now(), + updated_at=datetime.now(), + ) + + +@pytest.fixture +def mock_sdk(): + sdk = Mock(spec=AsyncNeMoPlatform) + sdk.models = Mock() + sdk.models.retrieve = AsyncMock( + side_effect=lambda name, workspace, verbose=True: _make_mock_model_entity(workspace=workspace, name=name), + ) + sdk.files = Mock() + sdk.files.filesets = Mock() + sdk.files.filesets.retrieve = AsyncMock(return_value=Mock()) + return sdk + + +def _make_job_output() -> CustomizationJobOutput: + return CustomizationJobOutput( + model="default/test-target", + dataset="default/my-dataset", + training=SFTTraining( + peft=LoRAParams(rank=8, alpha=32, merge=False), + learning_rate=1e-4, + batch_size=4, + micro_batch_size=1, + max_seq_length=2048, + ), + output=OutputResponse(name="out", type="adapter", fileset="out-fs"), + ) 
+ + +def test_build_file_download_config_rejects_missing_model_fileset() -> None: + with pytest.raises(PlatformJobCompilationError, match="has no fileset"): + _build_file_download_config(_make_job_output(), _make_mock_model_entity(fileset=None)) + + +@pytest.mark.asyncio +async def test_platform_job_config_compiler_sft_lora(mock_sdk, monkeypatch): + monkeypatch.setattr( + "nmp.automodel.app.jobs.compiler.fetch_model_entity", + AsyncMock(return_value=_make_mock_model_entity()), + ) + contract_dir = Path(__file__).resolve().parents[3] / "tests" / "customizer-automodel-contract" / "input_configs" + input_path = contract_dir / "llama-3.2-1b" / "llama_3_2_1b_lora.json" + if not input_path.exists(): + pytest.skip("contract configs not present") + + raw = json.loads(input_path.read_text()) + plugin_shape = { + "model": raw["model"]["path"], + "dataset": {"training": "default/train-data"}, + "training": { + "training_type": "sft", + "finetuning_type": "lora", + "lora": { + "rank": raw["training"]["lora"]["rank"], + "alpha": raw["training"]["lora"]["alpha"], + "merge": False, + }, + "max_seq_length": raw["model"]["max_seq_length"], + }, + "schedule": { + "epochs": raw["schedule"]["epochs"], + "max_steps": raw["schedule"]["max_steps"], + }, + "batch": { + "global_batch_size": raw["batch"]["global_batch_size"], + "micro_batch_size": raw["batch"]["micro_batch_size"], + }, + "optimizer": {"learning_rate": raw["optimizer"]["learning_rate"]}, + "parallelism": { + "num_nodes": raw["parallelism"]["num_nodes"], + "num_gpus_per_node": raw["parallelism"]["num_gpus_per_node"], + "tensor_parallel_size": raw["parallelism"]["tensor_parallel_size"], + }, + "output": {"name": "test-out", "type": "adapter", "fileset": "test-out-fs"}, + } + compiler_spec = automodel_spec_to_compiler_output(plugin_shape) + spec = await platform_job_config_compiler(compiler_spec, "default", mock_sdk) + + steps = spec.steps if hasattr(spec, "steps") else spec["steps"] + assert len(steps) == 4 + training_step = 
steps[1] + training_name = training_step.name if hasattr(training_step, "name") else training_step["name"] + assert training_name == "customization-training-job" + training_cmd = ( + training_step.executor.container.command + if hasattr(training_step, "executor") + else training_step["executor"]["container"]["command"] + ) + assert "nmp.automodel.tasks.training" in " ".join(training_cmd) + download_cmd = ( + steps[0].executor.container.command + if hasattr(steps[0], "executor") + else steps[0]["executor"]["container"]["command"] + ) + assert download_cmd[-1] == "nmp.automodel.tasks.file_io" + download_entrypoint = ( + steps[0].executor.container.entrypoint + if hasattr(steps[0], "executor") + else steps[0]["executor"]["container"]["entrypoint"] + ) + assert download_entrypoint == ["/opt/venv/bin/python"] + + def _step_image(step) -> str: + if hasattr(step, "executor"): + return step.executor.container.image + return step["executor"]["container"]["image"] + + tasks_image = f"{DEFAULT_AUTOMODEL_IMAGE_REGISTRY}/{TASKS_IMAGE_NAME}" + training_image = f"{DEFAULT_AUTOMODEL_IMAGE_REGISTRY}/{TRAINING_IMAGE_NAME}" + assert _step_image(steps[0]).startswith(tasks_image) + assert _step_image(steps[1]).startswith(training_image) + assert _step_image(steps[2]).startswith(tasks_image) + assert _step_image(steps[3]).startswith(tasks_image) diff --git a/services/automodel/tests/test_contract_configs.py b/services/automodel/tests/test_contract_configs.py new file mode 100644 index 00000000..0d808033 --- /dev/null +++ b/services/automodel/tests/test_contract_configs.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Contract continuity: compile_automodel_config import path and optional snapshot check.""" + +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path + +import pytest + +REPO_ROOT = Path(__file__).resolve().parents[3] +CONTRACT_DIR = REPO_ROOT / "tests" / "customizer-automodel-contract" +GENERATE_SCRIPT = CONTRACT_DIR / "generate_configs.py" + +# v1 excludes embedding SFT until product expands scope. +EMBEDDING_CONFIG_STEMS = {"embed_1b_lora", "embed_1b_full_sft"} + + +@pytest.mark.skipif(not CONTRACT_DIR.is_dir(), reason="contract fixtures not in tree") +def test_generate_configs_import_path() -> None: + """generate_configs.py must import compile_automodel_config from backends.config.""" + text = GENERATE_SCRIPT.read_text() + assert "backends.config import compile_automodel_config" in text + assert "backends.automodel.config" not in text + + +@pytest.mark.skipif(not CONTRACT_DIR.is_dir(), reason="contract fixtures not in tree") +@pytest.mark.parametrize( + "config_name", + [ + "llama_3_2_1b_lora", + "llama_3_2_1b_lora_packing", + "nemotron_nano_lora_packing", + ], +) +def test_contract_input_parses_as_training_step_config(config_name: str) -> None: + from nmp.automodel.tasks.training.schemas import TrainingStepConfig + + input_path = CONTRACT_DIR / "input_configs" / "llama-3.2-1b" / f"{config_name}.json" + if config_name.startswith("nemotron"): + input_path = CONTRACT_DIR / "input_configs" / "nemotron-nano" / f"{config_name}.json" + if not input_path.exists(): + pytest.skip(f"missing {input_path}") + + raw = json.loads(input_path.read_text()) + raw.pop("backend", None) + TrainingStepConfig.model_validate(raw) + + +@pytest.mark.skipif(not CONTRACT_DIR.is_dir(), reason="contract fixtures not in tree") +def test_contract_output_configs_up_to_date_excluding_embedding() -> None: + """Run generate_configs --check when nemo_automodel is available in the environment.""" + 
pytest.importorskip("nemo_automodel") + if not GENERATE_SCRIPT.is_file(): + pytest.skip("generate_configs.py missing") + + env = dict(**__import__("os").environ) + env["PYTHONPATH"] = str(REPO_ROOT / "services" / "automodel" / "src") + + result = subprocess.run( + [sys.executable, str(GENERATE_SCRIPT), "--check"], + cwd=CONTRACT_DIR, + env=env, + capture_output=True, + text=True, + ) + if result.returncode != 0: + combined = result.stdout + result.stderr + for stem in EMBEDDING_CONFIG_STEMS: + if stem in combined: + pytest.skip("contract check failed on embedding configs (excluded from v1)") + if "nemo_automodel" in combined and "ModuleNotFoundError" in combined: + pytest.skip("nemo_automodel not installed in test env (run in training image CI)") + pytest.fail(f"contract configs out of date:\n{combined}") diff --git a/services/automodel/tests/test_images.py b/services/automodel/tests/test_images.py new file mode 100644 index 00000000..4a633491 --- /dev/null +++ b/services/automodel/tests/test_images.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from nmp.automodel.config import AutomodelConfig +from nmp.automodel.images import ( + DEFAULT_AUTOMODEL_IMAGE_REGISTRY, + TASKS_IMAGE_NAME, + TRAINING_IMAGE_NAME, + get_automodel_qualified_image, + get_tasks_image, + get_training_image, +) + + +def test_default_automodel_images_use_nvcr_dev_registry(monkeypatch): + monkeypatch.setattr("nmp.automodel.images.config", AutomodelConfig()) + + tasks = get_tasks_image() + training = get_training_image() + + assert tasks == f"{DEFAULT_AUTOMODEL_IMAGE_REGISTRY}/{TASKS_IMAGE_NAME}:local" + assert training == f"{DEFAULT_AUTOMODEL_IMAGE_REGISTRY}/{TRAINING_IMAGE_NAME}:local" + assert TASKS_IMAGE_NAME.count("/") == 0 # NVCR: single repo segment, no nested paths + + +def test_automodel_image_registry_override(monkeypatch): + monkeypatch.setattr( + "nmp.automodel.images.config", + AutomodelConfig(image_registry="nvcr.io/0921617854601259/other-registry"), + ) + + assert ( + get_automodel_qualified_image(TASKS_IMAGE_NAME) + == "nvcr.io/0921617854601259/other-registry/nmp-automodel-tasks:local" + ) + + +def test_automodel_full_image_override(monkeypatch): + monkeypatch.setattr( + "nmp.automodel.images.config", + AutomodelConfig( + tasks_image="nvcr.io/0921617854601259/nemo-platform-dev/nmp-automodel-tasks:dev", + ), + ) + + assert get_tasks_image() == "nvcr.io/0921617854601259/nemo-platform-dev/nmp-automodel-tasks:dev" diff --git a/services/automodel/tests/test_job_context.py b/services/automodel/tests/test_job_context.py new file mode 100644 index 00000000..55efea6a --- /dev/null +++ b/services/automodel/tests/test_job_context.py @@ -0,0 +1,73 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Unit tests for NMPJobContext.""" + +from pathlib import Path + +import pytest +from nmp.automodel.app.constants import DEFAULT_JOB_STORAGE_PATH, NMP_FILES_URL_ENVVAR, NMP_JOBS_URL_ENVVAR +from nmp.automodel.app.jobs.context import ( + DEFAULT_ATTEMPT_ID, + DEFAULT_JOB_ID, + DEFAULT_STEP, + DEFAULT_TASK, + NMPJobContext, +) +from nmp.common.entities.constants import DEFAULT_WORKSPACE +from nmp.common.jobs.constants import ( + DEFAULT_NEMO_JOB_STEP_CONFIG_FILE_PATH, + NEMO_JOB_ATTEMPT_ID_ENVVAR, + NEMO_JOB_ID_ENVVAR, + NEMO_JOB_STEP_CONFIG_FILE_PATH_ENVVAR, + NEMO_JOB_STEP_ENVVAR, + NEMO_JOB_TASK_ENVVAR, + NEMO_JOB_WORKSPACE_ENVVAR, + PERSISTENT_JOB_STORAGE_PATH_ENVVAR, +) + + +class TestNMPJobContextFromEnv: + def test_uses_defaults_when_env_vars_not_set(self, monkeypatch: pytest.MonkeyPatch) -> None: + for var in ( + NEMO_JOB_WORKSPACE_ENVVAR, + NEMO_JOB_ID_ENVVAR, + NEMO_JOB_ATTEMPT_ID_ENVVAR, + NEMO_JOB_STEP_ENVVAR, + NEMO_JOB_TASK_ENVVAR, + NMP_JOBS_URL_ENVVAR, + NMP_FILES_URL_ENVVAR, + PERSISTENT_JOB_STORAGE_PATH_ENVVAR, + NEMO_JOB_STEP_CONFIG_FILE_PATH_ENVVAR, + ): + monkeypatch.delenv(var, raising=False) + + ctx = NMPJobContext.from_env() + + assert ctx.workspace == DEFAULT_WORKSPACE + assert ctx.job_id == DEFAULT_JOB_ID + assert ctx.attempt_id == DEFAULT_ATTEMPT_ID + assert ctx.step == DEFAULT_STEP + assert ctx.task == DEFAULT_TASK + assert ctx.jobs_url is None + assert ctx.files_url is None + assert ctx.storage_path == Path(DEFAULT_JOB_STORAGE_PATH) + assert ctx.config_path == Path(DEFAULT_NEMO_JOB_STEP_CONFIG_FILE_PATH) + + def test_uses_env_vars_when_set(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv(NEMO_JOB_WORKSPACE_ENVVAR, "test-workspace") + monkeypatch.setenv(NEMO_JOB_ID_ENVVAR, "job-123") + monkeypatch.setenv(NEMO_JOB_ATTEMPT_ID_ENVVAR, "attempt-5") + monkeypatch.setenv(NEMO_JOB_STEP_ENVVAR, "training") + monkeypatch.setenv(NEMO_JOB_TASK_ENVVAR, "train-model") + 
monkeypatch.setenv(NMP_JOBS_URL_ENVVAR, "http://jobs.example.com") + monkeypatch.setenv(NMP_FILES_URL_ENVVAR, "http://files.example.com") + monkeypatch.setenv(PERSISTENT_JOB_STORAGE_PATH_ENVVAR, "/custom/storage") + monkeypatch.setenv(NEMO_JOB_STEP_CONFIG_FILE_PATH_ENVVAR, "/custom/config.json") + + ctx = NMPJobContext.from_env() + + assert ctx.workspace == "test-workspace" + assert ctx.job_id == "job-123" + assert ctx.normalized_task == "task-train-model" + assert ctx.jobs_url == "http://jobs.example.com" diff --git a/services/automodel/tests/test_platform_client.py b/services/automodel/tests/test_platform_client.py new file mode 100644 index 00000000..c33b52ca --- /dev/null +++ b/services/automodel/tests/test_platform_client.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from nmp.automodel.app.jobs.file_io.schemas import FileSetRef + + +def test_fileset_ref_parse() -> None: + ref = FileSetRef.model_validate("acme-corp/my-dataset") + assert ref.workspace == "acme-corp" + assert ref.name == "my-dataset" + + bare = FileSetRef.model_validate("my-dataset") + assert bare.workspace is None + assert bare.name == "my-dataset" diff --git a/services/automodel/tests/test_progress_reporter.py b/services/automodel/tests/test_progress_reporter.py new file mode 100644 index 00000000..ab7fc778 --- /dev/null +++ b/services/automodel/tests/test_progress_reporter.py @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from nemo_platform import omit +from nmp.automodel.app.jobs.context import NMPJobContext +from nmp.automodel.tasks.progress_reporter import JobsServiceProgressReporter +from nmp.common.jobs.schemas import PlatformJobStatus + + +def test_progress_reporter_calls_sdk_create_or_update() -> None: + sdk = MagicMock() + ctx = NMPJobContext( + workspace="ws-a", + job_id="job-1", + attempt_id="attempt-0", + step="training", + task="train-model", + jobs_url="http://jobs.example.com", + files_url=None, + storage_path=Path("/tmp/job"), + config_path=Path("/tmp/job/config.json"), + ) + reporter = JobsServiceProgressReporter(sdk, ctx.workspace, ctx.job_id, ctx.step, ctx.normalized_task) + reporter.update_progress(PlatformJobStatus.ACTIVE, status_details={"phase": "training"}) + + sdk.jobs.tasks.create_or_update.assert_called_once_with( + ctx.normalized_task, + workspace=ctx.workspace, + job=ctx.job_id, + step=ctx.step, + status=PlatformJobStatus.ACTIVE.value, + status_details={"phase": "training"}, + error_details=omit, + error_stack=omit, + ) diff --git a/services/automodel/tests/test_validators.py b/services/automodel/tests/test_validators.py new file mode 100644 index 00000000..a3f904d8 --- /dev/null +++ b/services/automodel/tests/test_validators.py @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import pytest +from nmp.automodel.entities.validators import validate_fileset_uri + + +def test_validate_fileset_workspace_name() -> None: + assert validate_fileset_uri("acme-corp/train-data") == "acme-corp/train-data" + + +def test_validate_fileset_bare_name() -> None: + assert validate_fileset_uri("train-data") == "train-data" + + +def test_validate_strips_legacy_fileset_prefix() -> None: + assert validate_fileset_uri("fileset://acme-corp/train-data") == "acme-corp/train-data" + + +def test_validate_rejects_hf_protocol() -> None: + with pytest.raises(ValueError, match="Unsupported"): + validate_fileset_uri("hf://org/dataset") diff --git a/services/core/auth/scripts/auth-tools.py b/services/core/auth/scripts/auth-tools.py index ef2ecc36..c0076d8c 100755 --- a/services/core/auth/scripts/auth-tools.py +++ b/services/core/auth/scripts/auth-tools.py @@ -1306,5 +1306,76 @@ def extract_role_permissions_recursive( return perms +@app.command("sync-plugins") +def sync_plugins( + auth_path: Path = typer.Option( + None, "--auth", "-a", help="Path to static authorization configuration file (relative to project root)" + ), + dry_run: bool = typer.Option(False, "--dry-run", "-n", help="Show what would change without writing"), +): + """Merge plugin ``nemo.authz`` / ``get_authz_contribution`` data into static-authz.yaml. + + Run from the repo root with workspace plugins installed (``uv sync``). This + materializes runtime plugin policy into the committed bundle for CI and + environments that only load ``static-authz.yaml``. 
+ """ + project_root = get_project_root() + if auth_path is None: + auth_path = ( + project_root + / "services" + / "core" + / "auth" + / "src" + / "nmp" + / "core" + / "auth" + / "assets" + / "static-authz.yaml" + ) + else: + auth_path = project_root / auth_path + + try: + from nemo_platform_plugin.authz_discovery import discover_authz_contribution_dicts + from nmp.common.auth.authz_merge import merge_authz_contributions + except ImportError as exc: + console.print(f"[red]Cannot import plugin authz discovery: {exc}[/red]") + console.print("[yellow]Run from repo root with workspace packages installed (uv sync).[/yellow]") + raise typer.Exit(code=1) from exc + + contributions = discover_authz_contribution_dicts() + if not contributions: + console.print("[yellow]No plugin authz contributions discovered.[/yellow]") + raise typer.Exit(code=0) + + auth_config = load_yaml(auth_path) + before_endpoints = set(auth_config.get("authz", {}).get("endpoints", {}).keys()) + merged = merge_authz_contributions(auth_config, contributions) + after_endpoints = set(merged.get("authz", {}).get("endpoints", {}).keys()) + added_paths = sorted(after_endpoints - before_endpoints) + + console.print(f"[bold]Merging {len(contributions)} plugin authz contribution(s)...[/bold]") + for path in added_paths: + methods = sorted(merged["authz"]["endpoints"][path].keys()) + console.print(f" [green]+[/green] {path} ({', '.join(methods)})") + + if not added_paths: + console.print("[dim]No new endpoint paths (contributions may already be present).[/dim]") + + if dry_run: + console.print("[dim]Dry run — not writing file.[/dim]") + return + + sorted_endpoints = {} + for path in sorted(merged["authz"]["endpoints"].keys()): + sorted_methods = dict(sorted(merged["authz"]["endpoints"][path].items())) + sorted_endpoints[path] = sorted_methods + merged["authz"]["endpoints"] = sorted_endpoints + + save_yaml(auth_path, merged) + console.print(f"[green]✓ Updated {auth_path}[/green]") + + if __name__ == "__main__": 
app() diff --git a/services/core/auth/src/nmp/core/auth/app/bundle.py b/services/core/auth/src/nmp/core/auth/app/bundle.py index b19e58c3..3a36474f 100644 --- a/services/core/auth/src/nmp/core/auth/app/bundle.py +++ b/services/core/auth/src/nmp/core/auth/app/bundle.py @@ -8,6 +8,7 @@ import hashlib import io import json +import logging import tarfile import time from pathlib import Path @@ -21,6 +22,8 @@ from nmp.core.auth.config import AuthServiceConfig from nmp.core.auth.entities import RoleBindingEntity +logger = logging.getLogger(__name__) + # Bundle cache configuration _bundle_cache: Optional[Tuple[bytes, str, float]] = None # (bundle_bytes, etag, timestamp) _bundle_lock = asyncio.Lock() @@ -80,6 +83,23 @@ async def get_opa_bundle_with_etag(entities_client: Optional[EntityClient] = Non return bundle_bytes, etag +def _merge_plugin_authz_contributions(static_data: dict) -> dict: + """Overlay authorization rules from installed NeMo Platform plugins.""" + try: + from nemo_platform_plugin.authz_discovery import discover_authz_contribution_dicts + from nmp.common.auth.authz_merge import merge_authz_contributions + except ImportError: + logger.debug("Plugin authz discovery unavailable; using static authz only") + return static_data + + contributions = discover_authz_contribution_dicts() + if not contributions: + return static_data + + logger.debug("Merging %d plugin authz contribution(s)", len(contributions)) + return merge_authz_contributions(static_data, contributions) + + async def _build_authorization_data_internal(entities_client: Optional[EntityClient] = None) -> dict: """Build authorization data for NeMo Platform. 
@@ -102,6 +122,7 @@ async def _build_authorization_data_internal(entities_client: Optional[EntityCli with open(static_data_path, "r") as f: static_data = yaml.safe_load(f) + static_data = _merge_plugin_authz_contributions(static_data) validate_static_authz_data(static_data) # Initialize workspaces and principals if not present diff --git a/services/core/entities/config/local.env b/services/core/entities/config/local.env index 7283e036..1fb53eda 100644 --- a/services/core/entities/config/local.env +++ b/services/core/entities/config/local.env @@ -1,9 +1,4 @@ NMP_IMAGE_REGISTRY=my-registry NMP_IMAGE_TAG=local -DATABASE_NAME=entities -DATABASE_DIALECT=postgresql -DATABASE_USER=nmp -DATABASE_PASSWORD=nmp -DATABASE_HOST=localhost -DATABASE_PORT=5432 - +DATABASE_DIALECT=sqlite +DATABASE_PATH="${HOME}/.local/share/nemo/nmp-platform.db" diff --git a/services/core/jobs/src/nmp/core/jobs/config.py b/services/core/jobs/src/nmp/core/jobs/config.py index ce2d30ee..3c71a0fb 100644 --- a/services/core/jobs/src/nmp/core/jobs/config.py +++ b/services/core/jobs/src/nmp/core/jobs/config.py @@ -5,7 +5,7 @@ from typing import Self -from nmp.common.config import create_service_config_class, get_platform_config, get_service_config +from nmp.common.config import Runtime, create_service_config_class, get_platform_config, get_service_config from nmp.core.jobs.app.profiles import ExecutionProfileT from nmp.core.jobs.controllers.backends.config import ( DefaultExecutionProfileConfig, @@ -30,6 +30,19 @@ class JobsServiceConfig(create_service_config_class("jobs")): # type: ignore ) reconcile_interval_seconds: int = Field(default=2, description="Interval in seconds for the job reconciler to run") schedule_interval_seconds: int = Field(default=5, description="Interval in seconds for the job scheduler to run") + enable_subprocess_executor: bool | None = Field( + default=None, + description=( + "Register the subprocess/default execution profile. 
When unset, defaults to true for " + "docker/none runtimes and false for kubernetes." + ), + ) + + def resolved_enable_subprocess_executor(self) -> bool: + """Whether host subprocess execution is registered for default profiles.""" + if self.enable_subprocess_executor is not None: + return self.enable_subprocess_executor + return get_platform_config().runtime != Runtime.KUBERNETES @model_validator(mode="after") def validate_executors(self) -> Self: @@ -55,5 +68,6 @@ def validate_executors(self) -> Self: get_default_executor_profiles_for_runtime( runtime=get_platform_config().runtime, defaults=config.executor_defaults, + enable_subprocess_executor=config.resolved_enable_subprocess_executor(), ), ) diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py index 0e137b8a..f5d4724f 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py +++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py @@ -41,8 +41,14 @@ class DefaultExecutionProfileConfig(BaseModel): ) -def get_default_executor_profiles_for_runtime(runtime: Runtime, defaults: DefaultExecutionProfileConfig) -> list: +def get_default_executor_profiles_for_runtime( + runtime: Runtime, + defaults: DefaultExecutionProfileConfig, + enable_subprocess_executor: bool | None = None, +) -> list: """Returns a list of default executor profiles based on the deployment runtime.""" + if enable_subprocess_executor is None: + enable_subprocess_executor = runtime != Runtime.KUBERNETES logger.debug("Getting default executors for runtime: %s", runtime) executors = [] @@ -87,9 +93,7 @@ def get_default_executor_profiles_for_runtime(runtime: Runtime, defaults: Defaul ] ) - # Subprocess execution is available for single-host runtimes only. Kubernetes deployments must opt in - # explicitly so subprocess profiles do not appear on distributed service pods by default. 
- if runtime != Runtime.KUBERNETES: + if enable_subprocess_executor: executors.append( SubprocessJobExecutionProfile( provider="subprocess", diff --git a/services/core/models/pyproject.toml b/services/core/models/pyproject.toml index c58a342f..203ddb50 100644 --- a/services/core/models/pyproject.toml +++ b/services/core/models/pyproject.toml @@ -37,8 +37,8 @@ packages = ["src/nmp"] [dependency-groups] # No task deps for models — `nmp.core.models.parallelism` is the only consumer of # torch/transformers/accelerate, and it's only invoked from the `model_spec` batch -# task. That task runs in the `customizer-tasks` image, which installs torch -# itself; parallelism tests in this repo guard with `pytest.importorskip("torch")`. +# task. That task runs in the `nmp-automodel-tasks` image (PyTorch from +# nmp-automodel-base); parallelism tests in this repo guard with `pytest.importorskip("torch")`. dev = [ "pytest>=8.3.4", diff --git a/services/core/models/src/nmp/core/models/api/v2/models.py b/services/core/models/src/nmp/core/models/api/v2/models.py index 8e4637ff..343a94e4 100644 --- a/services/core/models/src/nmp/core/models/api/v2/models.py +++ b/services/core/models/src/nmp/core/models/api/v2/models.py @@ -270,8 +270,9 @@ async def start_update_model_spec_job(model_entity: ModelEntity): executor=CPUExecutionProviderSpec( provider="cpu", container=ContainerSpec( - image=get_qualified_image("customizer-tasks"), - command=["nemo-platform", "run", "task", "--task", "nmp.core.models.tasks.model_spec"], + image=get_qualified_image("nmp-automodel-tasks"), + entrypoint=["/opt/venv/bin/python"], + command=["-m", "nmp.core.models.tasks.model_spec"], ), resources=ResourcesSpec( requests=ResourcesRequestsSpec( diff --git a/tests/customizer-automodel-contract/generate_configs.py b/tests/customizer-automodel-contract/generate_configs.py index 49979217..65a8b9a5 100644 --- a/tests/customizer-automodel-contract/generate_configs.py +++ 
b/tests/customizer-automodel-contract/generate_configs.py @@ -48,17 +48,20 @@ SCRIPT_DIR = Path(__file__).resolve().parent REPO_ROOT = SCRIPT_DIR.parent.parent +AUTOMODEL_SRC = REPO_ROOT / "services" / "automodel" / "src" CUSTOMIZER_SRC = REPO_ROOT / "services" / "customizer" / "src" -if CUSTOMIZER_SRC.is_dir(): +if AUTOMODEL_SRC.is_dir(): + sys.path.insert(0, str(AUTOMODEL_SRC)) +elif CUSTOMIZER_SRC.is_dir(): sys.path.insert(0, str(CUSTOMIZER_SRC)) else: - sys.path.insert(0, "/app/services/customizer/src") + sys.path.insert(0, "/app/services/automodel/src") -from nmp.customizer.app.constants import V4_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES # noqa: E402 -from nmp.customizer.app.jobs.context import NMPJobContext # noqa: E402 -from nmp.customizer.tasks.training.backends.automodel.config import compile_automodel_config # noqa: E402 -from nmp.customizer.tasks.training.schemas import TrainingStepConfig # noqa: E402 +from nmp.automodel.app.constants import V4_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES # noqa: E402 +from nmp.automodel.app.jobs.context import NMPJobContext # noqa: E402 +from nmp.automodel.tasks.training.backends.config import compile_automodel_config # noqa: E402 +from nmp.automodel.tasks.training.schemas import TrainingStepConfig # noqa: E402 INPUT_DIR = SCRIPT_DIR / "input_configs" OUTPUT_DIR = SCRIPT_DIR / "output_configs" diff --git a/tests/smoke_gpu/conftest.py b/tests/smoke_gpu/conftest.py index ae32d9c1..6e0c602b 100644 --- a/tests/smoke_gpu/conftest.py +++ b/tests/smoke_gpu/conftest.py @@ -9,3 +9,9 @@ def pytest_configure(config): "markers", "smoke_customizer_automodel: Import smoke tests for the customizer-automodel image" ) config.addinivalue_line("markers", "smoke_customizer_rl: Import smoke tests for the customizer-rl image") + config.addinivalue_line( + "markers", "smoke_nmp_automodel_tasks: Import smoke tests for the nmp/automodel-tasks image" + ) + config.addinivalue_line( + "markers", "smoke_nmp_automodel_training: Import smoke tests for the 
nmp/automodel-training image" + ) diff --git a/tests/smoke_gpu/test_nemo_automodel.py b/tests/smoke_gpu/test_nemo_automodel.py new file mode 100644 index 00000000..f9d80638 --- /dev/null +++ b/tests/smoke_gpu/test_nemo_automodel.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""NeMo Automodel image import smoke tests. + +Built as part of the nmp-automodel docker bake group (smoke-test stage) and run +on a CPU runner - no GPU hardware required. +""" + +import pytest + + +def test_torch_importable(): + import torch # noqa: F401 + + +def test_transformers_importable(): + import transformers # noqa: F401 + + +def test_mamba_ssm_importable(): + import mamba_ssm # noqa: F401 + + +def test_causal_conv1d_importable(): + import causal_conv1d # noqa: F401 + + +def test_bitsandbytes_importable(): + import bitsandbytes # noqa: F401 + + +@pytest.mark.smoke_nmp_automodel_tasks +def test_nmp_automodel_tasks_importable(): + from nmp.automodel.tasks import file_io # noqa: F401 + from nmp.automodel.tasks.model_entity import __main__ as model_entity_main # noqa: F401 + from nmp.core.models.tasks.model_spec import __main__ as model_spec_main # noqa: F401 + + +@pytest.mark.smoke_nmp_automodel_training +def test_nmp_automodel_training_importable(): + import nemo_automodel # noqa: F401 + from nmp.automodel.tasks.training import __main__ as training_main # noqa: F401 diff --git a/third_party/licenses.jsonl b/third_party/licenses.jsonl index 8405ae9f..00293e88 100644 --- a/third_party/licenses.jsonl +++ b/third_party/licenses.jsonl @@ -268,6 +268,7 @@ {"name": "ruff", "license": "MIT", "compatible": true} {"name": "s3transfer", "license": "APACHE-2.0", "compatible": true} {"name": "sacrebleu", "license": "APACHE-2.0", "compatible": true} +{"name": "safetensors", "license": "APACHE-2.0", "compatible": true} {"name": "scikit-network", "license": "BSD-3-CLAUSE", 
"compatible": true} {"name": "scipy", "license": "BSD-3-CLAUSE", "compatible": true} {"name": "secretstorage", "license": "BSD-3-CLAUSE", "compatible": true} @@ -297,6 +298,7 @@ {"name": "tomlkit", "license": "MIT", "compatible": true} {"name": "tornado", "license": "APACHE-2.0", "compatible": true} {"name": "tqdm", "license": "MIT", "compatible": true} +{"name": "transformers", "license": "APACHE-2.0", "compatible": true} {"name": "typer", "license": "MIT", "compatible": true} {"name": "types-aioboto3", "license": "MIT", "compatible": true} {"name": "types-aiobotocore", "license": "MIT", "compatible": true} diff --git a/third_party/osv-licenses.json b/third_party/osv-licenses.json index 7a72e081..fe4d13a4 100644 --- a/third_party/osv-licenses.json +++ b/third_party/osv-licenses.json @@ -1916,7 +1916,7 @@ { "package": { "name": "nvidia-nat-atif", - "version": "1.7.0rc3", + "version": "1.7.0", "ecosystem": "PyPI" }, "licenses": [ @@ -1926,7 +1926,7 @@ { "package": { "name": "nvidia-nat-config-optimizer", - "version": "1.7.0rc3", + "version": "1.7.0", "ecosystem": "PyPI" }, "licenses": [ @@ -1936,7 +1936,7 @@ { "package": { "name": "nvidia-nat-core", - "version": "1.7.0rc3", + "version": "1.7.0", "ecosystem": "PyPI" }, "licenses": [ @@ -1946,7 +1946,7 @@ { "package": { "name": "nvidia-nat-eval", - "version": "1.7.0rc3", + "version": "1.7.0", "ecosystem": "PyPI" }, "licenses": [ @@ -1956,7 +1956,7 @@ { "package": { "name": "nvidia-nat-langchain", - "version": "1.7.0rc3", + "version": "1.7.0", "ecosystem": "PyPI" }, "licenses": [ @@ -1966,7 +1966,7 @@ { "package": { "name": "nvidia-nat-opentelemetry", - "version": "1.7.0rc3", + "version": "1.7.0", "ecosystem": "PyPI" }, "licenses": [ @@ -3099,6 +3099,16 @@ "Apache-2.0" ] }, + { + "package": { + "name": "safetensors", + "version": "0.8.0rc0", + "ecosystem": "PyPI" + }, + "licenses": [ + "non-standard" + ] + }, { "package": { "name": "scikit-network", @@ -3242,7 +3252,7 @@ { "package": { "name": "sqlfluff", - "version": 
"3.4.0", + "version": "4.1.0", "ecosystem": "PyPI" }, "vulnerabilities": [ @@ -3461,29 +3471,69 @@ "nvd_published_at": null, "severity": "HIGH" } - }, + } + ], + "groups": [ { - "modified": "2026-05-19T20:15:16Z", - "published": "2026-05-19T20:10:17Z", - "schema_version": "1.7.5", - "id": "GHSA-wmhf-fqc8-vxhh", + "ids": [ + "GHSA-73jc-5mrq-prw7" + ], "aliases": [ - "CVE-2026-46373" + "CVE-2026-46374", + "GHSA-73jc-5mrq-prw7" ], - "summary": "SQLFluff: Recursive Stack Overflow in Parser", - "details": "### Impact\n\nIn deployments where untrusted users can provide SQL queries to be linted, an untrusted user can submit a malicious query with deliberate excessive nesting to any application using the parser to trigger a Denial of Service through resource exhaustion.\n\n### Patches\n\nVersions 4.1.0 and up contain a configurable recursion limit, which is enabled by default, to prevent this manner of exploit.\n\n### Credit\n\nOri Nakar from Imperva Threat Research Team.", - "severity": [ - { - "type": "CVSS_V3", - "score": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:H" - } + "max_severity": "7.5" + } + ], + "licenses": [ + "non-standard" + ] + }, + { + "package": { + "name": "sqlmodel", + "version": "0.0.37", + "ecosystem": "PyPI" + }, + "licenses": [ + "MIT" + ] + }, + { + "package": { + "name": "sse-starlette", + "version": "3.3.4", + "ecosystem": "PyPI" + }, + "licenses": [ + "BSD-3-Clause" + ] + }, + { + "package": { + "name": "starlette", + "version": "0.52.1", + "ecosystem": "PyPI" + }, + "vulnerabilities": [ + { + "modified": "2026-05-26T15:41:31Z", + "published": "2026-05-22T13:11:38Z", + "schema_version": "1.7.5", + "id": "PYSEC-2026-161", + "aliases": [ + "CVE-2026-48710", + "GHSA-86qp-5c8j-p5mr", + "X41-2026-002" ], + "summary": "BadHost: Missing Host header validation poisons request.url.path, bypassing path-based security checks", + "details": "Starlette reconstructs the requested URL based on the HTTP Host request header and requested path, but does not 
perform any validation of the Host header value. This allows attackers to inject paths into the host part, prepending the actual path. However, routing in Starlette is based on the actual request path. This inconsistent interpretation of HTTP requests may lead to issues such as authentication bypass when the authentication depends on the reconstructed URL\u2019s path.", "affected": [ { "package": { "ecosystem": "PyPI", - "name": "sqlfluff", - "purl": "pkg:pypi/sqlfluff" + "name": "starlette", + "purl": "pkg:pypi/starlette" }, "ranges": [ { @@ -3493,243 +3543,259 @@ "introduced": "0" }, { - "fixed": "4.1.0" + "fixed": "1.0.1" } ] } ], "versions": [ - "0.0.1", - "0.0.2", - "0.0.3", - "0.0.4", - "0.0.5", - "0.0.6", - "0.0.7", - "0.0.9", "0.1.0", "0.1.1", + "0.1.10", + "0.1.11", + "0.1.12", + "0.1.13", + "0.1.14", + "0.1.15", + "0.1.16", + "0.1.17", "0.1.2", "0.1.3", "0.1.4", "0.1.5", + "0.1.6", + "0.1.7", + "0.1.8", + "0.1.9", "0.10.0", "0.10.1", + "0.10.2", + "0.10.3", + "0.10.4", + "0.10.5", + "0.10.6", + "0.10.7", "0.11.0", "0.11.1", "0.11.2", + "0.11.3", + "0.11.4", "0.12.0", + "0.12.0b1", + "0.12.0b2", + "0.12.0b3", + "0.12.1", + "0.12.10", + "0.12.11", + "0.12.12", + "0.12.13", + "0.12.2", + "0.12.3", + "0.12.4", + "0.12.5", + "0.12.6", + "0.12.7", + "0.12.8", + "0.12.9", "0.13.0", "0.13.1", "0.13.2", + "0.13.3", + "0.13.4", + "0.13.5", + "0.13.6", + "0.13.7", + "0.13.8", + "0.14.0", + "0.14.1", + "0.14.2", + "0.15.0", + "0.16.0", + "0.17.0", + "0.17.1", + "0.18.0", + "0.19.0", + "0.19.1", "0.2.0", "0.2.1", "0.2.2", "0.2.3", - "0.2.4", + "0.20.0", + "0.20.1", + "0.20.2", + "0.20.3", + "0.20.4", + "0.21.0", + "0.22.0", + "0.23.0", + "0.23.1", + "0.24.0", + "0.25.0", + "0.26.0", + "0.26.0.post1", + "0.26.1", + "0.27.0", + "0.28.0", + "0.29.0", "0.3.0", "0.3.1", "0.3.2", - "0.3.2.post1", - "0.3.2.post2", "0.3.3", "0.3.4", "0.3.5", "0.3.6", + "0.3.7", + "0.30.0", + "0.31.0", + "0.31.1", + "0.32.0", + "0.32.0.post1", + "0.33.0", + "0.34.0", + "0.35.0", + "0.35.1", + 
"0.36.0", + "0.36.1", + "0.36.2", + "0.36.3", + "0.37.0", + "0.37.1", + "0.37.2", + "0.38.0", + "0.38.1", + "0.38.2", + "0.38.3", + "0.38.4", + "0.38.5", + "0.38.6", + "0.39.0", + "0.39.1", + "0.39.2", "0.4.0", - "0.4.0a1", - "0.4.0a2", - "0.4.0a3", "0.4.1", + "0.4.2", + "0.40.0", + "0.41.0", + "0.41.1", + "0.41.2", + "0.41.3", + "0.42.0", + "0.43.0", + "0.44.0", + "0.45.0", + "0.45.1", + "0.45.2", + "0.45.3", + "0.46.0", + "0.46.1", + "0.46.2", + "0.47.0", + "0.47.1", + "0.47.2", + "0.47.3", + "0.48.0", + "0.49.0", + "0.49.1", + "0.49.2", + "0.49.3", "0.5.0", "0.5.1", "0.5.2", "0.5.3", "0.5.4", "0.5.5", - "0.5.6", + "0.50.0", + "0.51.0", + "0.52.0", + "0.52.1", "0.6.0", - "0.6.0a1", - "0.6.0a2", "0.6.1", "0.6.2", "0.6.3", - "0.6.4", - "0.6.5", - "0.6.6", - "0.6.7", - "0.6.8", - "0.6.9", "0.7.0", - "0.7.0a1", - "0.7.0a2", - "0.7.0a3", - "0.7.0a5", - "0.7.0a8", "0.7.1", + "0.7.2", + "0.7.3", + "0.7.4", "0.8.0", "0.8.1", "0.8.2", + "0.8.3", + "0.8.4", + "0.8.5", + "0.8.6", + "0.8.7", + "0.8.8", "0.9.0", "0.9.1", + "0.9.10", + "0.9.11", "0.9.2", "0.9.3", "0.9.4", + "0.9.5", + "0.9.6", + "0.9.7", + "0.9.8", + "0.9.9", "1.0.0", - "1.1.0", - "1.2.0", - "1.2.1", - "1.3.0", - "1.3.1", - "1.3.2", - "1.4.0", - "1.4.1", - "1.4.2", - "1.4.3", - "1.4.4", - "1.4.5", - "2.0.0", - "2.0.0a1", - "2.0.0a2", - "2.0.0a3", - "2.0.0a4", - "2.0.0a5", - "2.0.0a6", - "2.0.1", - "2.0.2", - "2.0.3", - "2.0.4", - "2.0.5", - "2.0.6", - "2.0.7", - "2.1.0", - "2.1.1", - "2.1.2", - "2.1.3", - "2.1.4", - "2.2.0", - "2.2.1", - "2.3.0", - "2.3.1", - "2.3.2", - "2.3.3", - "2.3.4", - "2.3.5", - "3.0.0", - "3.0.0a1", - "3.0.0a2", - "3.0.0a3", - "3.0.0a4", - "3.0.0a5", - "3.0.0a6", - "3.0.1", - "3.0.2", - "3.0.3", - "3.0.4", - "3.0.5", - "3.0.6", - "3.0.7", - "3.1.0", - "3.1.1", - "3.2.0", - "3.2.1", - "3.2.2", - "3.2.3", - "3.2.4", - "3.2.5", - "3.3.0", - "3.3.1", - "3.4.0", - "3.4.1", - "3.4.2", - "3.5.0", - "4.0.0", - "4.0.0a1", - "4.0.0a2", - "4.0.0a3", - "4.0.1.post1", - "4.0.3", - "4.0.4", - 
"4.0.4a1" + "1.0.0rc1" ], "database_specific": { - "source": "https://github.com/github/advisory-database/blob/main/advisories/github-reviewed/2026/05/GHSA-wmhf-fqc8-vxhh/GHSA-wmhf-fqc8-vxhh.json" + "source": "https://github.com/pypa/advisory-database/blob/main/vulns/starlette/PYSEC-2026-161.yaml" } } ], "references": [ { - "type": "WEB", - "url": "https://github.com/sqlfluff/sqlfluff/security/advisories/GHSA-wmhf-fqc8-vxhh" + "type": "ARTICLE", + "url": "https://badhost.org" }, { - "type": "PACKAGE", - "url": "https://github.com/sqlfluff/sqlfluff" + "type": "ARTICLE", + "url": "https://www.secwest.net/starlette" + }, + { + "type": "ARTICLE", + "url": "https://ostif.org/disclosing-the-badhost-vulnerability-in-starlette/" + }, + { + "type": "DETECTION", + "url": "https://badhost.org" + }, + { + "type": "EVIDENCE", + "url": "https://github.com/Kludex/starlette/security/advisories/GHSA-86qp-5c8j-p5mr" + }, + { + "type": "ADVISORY", + "url": "https://github.com/Kludex/starlette/security/advisories/GHSA-86qp-5c8j-p5mr" + }, + { + "type": "ADVISORY", + "url": "https://www.x41-dsec.de/lab/advisories/x41-2026-002-starlette/" + }, + { + "type": "FIX", + "url": "https://github.com/Kludex/starlette/commit/764dab0dcfb9033d75442d7a359645c9f94648c6" } - ], - "database_specific": { - "cwe_ids": [ - "CWE-674" - ], - "github_reviewed": true, - "github_reviewed_at": "2026-05-19T20:10:17Z", - "nvd_published_at": null, - "severity": "HIGH" - } + ] } ], "groups": [ { "ids": [ - "GHSA-73jc-5mrq-prw7" + "PYSEC-2026-161" ], "aliases": [ - "CVE-2026-46374", - "GHSA-73jc-5mrq-prw7" - ], - "max_severity": "7.5" - }, - { - "ids": [ - "GHSA-wmhf-fqc8-vxhh" + "CVE-2026-48710", + "GHSA-86qp-5c8j-p5mr", + "PYSEC-2026-161", + "X41-2026-002" ], - "aliases": [ - "CVE-2026-46373", - "GHSA-wmhf-fqc8-vxhh" - ], - "max_severity": "7.5" + "max_severity": "" } ], - "licenses": [ - "non-standard" - ] - }, - { - "package": { - "name": "sqlmodel", - "version": "0.0.37", - "ecosystem": "PyPI" - }, - 
"licenses": [ - "MIT" - ] - }, - { - "package": { - "name": "sse-starlette", - "version": "3.3.4", - "ecosystem": "PyPI" - }, - "licenses": [ - "BSD-3-Clause" - ] - }, - { - "package": { - "name": "starlette", - "version": "0.52.1", - "ecosystem": "PyPI" - }, "licenses": [ "BSD-3-Clause" ] @@ -3844,6 +3910,16 @@ "MIT AND MPL-2.0" ] }, + { + "package": { + "name": "transformers", + "version": "5.3.0", + "ecosystem": "PyPI" + }, + "licenses": [ + "Apache-2.0" + ] + }, { "package": { "name": "typer", @@ -4344,11 +4420,11 @@ }, { "name": "Apache-2.0", - "count": 84 + "count": 85 }, { "name": "non-standard", - "count": 47 + "count": 48 }, { "name": "BSD-3-Clause", diff --git a/third_party/requirements-main.txt b/third_party/requirements-main.txt index 2cf07c01..39b1184c 100644 --- a/third_party/requirements-main.txt +++ b/third_party/requirements-main.txt @@ -25,6 +25,8 @@ # nemo-agents-plugin # nemo-anonymizer-plugin # nemo-auditor-plugin + # nemo-automodel-plugin + # nemo-customizer-plugin # nemo-data-designer-plugin # nemo-evaluator-plugin # nemo-guardrails-plugin @@ -36,6 +38,8 @@ # nemo-agents-plugin # nemo-anonymizer-plugin # nemo-auditor-plugin + # nemo-automodel-plugin + # nemo-customizer-plugin # nemo-data-designer-plugin # nemo-evaluator-plugin # nemo-guardrails-plugin @@ -55,6 +59,7 @@ # nemo-platform # nemoplatform # nmp-auth + # nmp-automodel # nmp-core-mcp # nmp-customizer # nmp-entities @@ -83,6 +88,8 @@ # via nemo-agents-plugin -e ./plugins/nemo-anonymizer ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') -e ./plugins/nemo-auditor ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') +-e ./plugins/nemo-automodel ; (platform_machine == 'arm64' and sys_platform == 'darwin') or 
(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') +-e ./plugins/nemo-customizer ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') -e ./plugins/nemo-data-designer ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') -e ./plugins/nemo-evaluator ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') -e ./plugins/nemo-guardrails ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') @@ -95,9 +102,12 @@ # nemo-platform # nemo-platform-ext # nemo-platform-plugin + # nmp-automodel # nmp-common # nmp-core-mcp # nmp-entities +-e ./services/automodel ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') + # via nemo-automodel-plugin -e ./services/core/auth ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') # via # nemoplatform @@ -174,6 +184,7 @@ aiofiles==25.1.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or # aioboto3 # nemoguardrails # ngcsdk + # nmp-automodel # nmp-common # nmp-evaluator # nvidia-nat-core @@ -407,6 +418,7 @@ boto3==1.40.61 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or ( # via # aiobotocore # langchain-aws + # nemo-agents-plugin # ngcsdk botocore==1.40.61 ; 
(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:17ebae412692fd4824f99cde0f08d50126dc97954008e5ba2b522eb049238aa7 \ @@ -414,6 +426,7 @@ botocore==1.40.61 ; (platform_machine == 'arm64' and sys_platform == 'darwin') o # via # aiobotocore # boto3 + # nemo-agents-plugin # ngcsdk # s3transfer botocore-stubs==1.42.41 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ @@ -706,11 +719,15 @@ data-designer-engine==0.6.0 ; (platform_machine == 'arm64' and sys_platform == ' dataclasses-json==0.6.7 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:0dbf33f26c8d5305befd61b39d2b3414e8a407bedc2834dea9b8d642666fb40a \ --hash=sha256:b6b3e528266ea45b9535223bc53ca645f5208833c29229e847b3f26a1cc55fc0 - # via langchain-community + # via + # langchain-community + # nemo-guardrails-plugin + # nmp-guardrails datasets==4.0.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:7ef95e62025fd122882dbce6cb904c8cd3fbc829de6669a5eb939c77d50e203d \ --hash=sha256:9657e7140a9050db13443ba21cb5de185af8af944479b00e7ff1e00a61c8dbf1 # via + # nemo-customizer-plugin # nemo-safe-synthesizer # nmp-evaluator # ragas @@ -787,7 +804,6 @@ duckdb==1.5.1 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (p # via # data-designer-engine # data-designer-nemo - # nemo-data-designer-plugin # nmp-files # nmp-jobs durationpy==0.10 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and 
sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ @@ -1232,6 +1248,7 @@ httpx==0.28.1 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (p # nemo-platform-sdk # nemoguardrails # nmp-auth + # nmp-automodel # nmp-guardrails # nmp-intake # nmp-safe-synthesizer @@ -1262,6 +1279,7 @@ huggingface-hub==1.15.0 ; (platform_machine == 'arm64' and sys_platform == 'darw # nmp-evaluator # nmp-intake # tokenizers + # transformers humanize==4.15.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:1dd098483eb1c7ee8e32eb2e99ad1910baefa4b75c3aff3a82f4d78688993b10 \ --hash=sha256:b1186eb9f5a9749cd9cb8565aee77919dd7c8d076161cf44d70e59e3301e1769 @@ -1486,6 +1504,7 @@ jsonschema==4.23.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') # mcp # nemo-evaluator-sdk # nemo-safe-synthesizer + # nmp-automodel jsonschema-path==0.3.4 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:8365356039f16cc65fddffafda5f58766e34bebab7d6d105616ab52bc4297001 \ --hash=sha256:f502191fdc2b22050f9a81c9237be9d27145b9001c55842bece5e94e382e52f8 @@ -1524,7 +1543,9 @@ langchain==1.2.14 ; (platform_machine == 'arm64' and sys_platform == 'darwin') o langchain-aws==1.1.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:1e2f8570328eae4907c3cf7e900dc68d8034ddc865d9dc96823c9f9d8cccb901 \ --hash=sha256:8ec074615b42839e035354063717374c32c63f5028ef5221ba073fd5f3ef5e37 - # via nvidia-nat-langchain + # via + # nemo-agents-plugin + # nvidia-nat-langchain langchain-classic==1.0.7 ; (platform_machine == 'arm64' and 
sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:d9d9be38f7aa534ed0259c2410432e34a1f80b1d491e686749bb55af56479be3 \ --hash=sha256:debbec8065e69b95108d2652e8d5c44f4516e19aa8d716c02ed2211c3aee099d @@ -2101,6 +2122,7 @@ numpy==2.4.4 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (pl # sacrebleu # scikit-network # scipy + # transformers nvidia-ml-py==13.595.45 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:b65a7977f503d56154b14d683710125ef93594adb63fbf7e559336e3318f1376 \ --hash=sha256:c9f34897fe0441ff35bc8f35baf80f830a20b0f4e6ce71e0a325bc0e66acf079 @@ -2108,37 +2130,33 @@ nvidia-ml-py==13.595.45 ; (platform_machine == 'arm64' and sys_platform == 'darw # nemo-platform-ext # nemo-platform-sdk # nmp-common -nvidia-nat-atif==1.7.0rc3 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ - --hash=sha256:b95979e24aae0ba07277ec20e993ae910278b2bd0b00ce3415037f02694a1ebb +nvidia-nat-atif==1.7.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ + --hash=sha256:65d471a366dfafe75cf94428bdba3007bd3e1368487e2e16fd62484298635334 # via - # nemo-agents-plugin # nvidia-nat-core # nvidia-nat-eval -nvidia-nat-config-optimizer==1.7.0rc3 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ - --hash=sha256:a0e04114fc85df980b96766dec366bf4c7a2e08e6aaa4ce1531dd1bc6dadfcde +nvidia-nat-config-optimizer==1.7.0 ; 
(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ + --hash=sha256:5b14cb63d1076f314f63039a4f659c0d4129f20b2bedc38b026ceee53da9e8f1 # via nemo-agents-plugin -nvidia-nat-core==1.7.0rc3 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ - --hash=sha256:590ae4bf36f79068605806de2044429f29001a206d31a7297a4f1dacbedf9eb1 +nvidia-nat-core==1.7.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ + --hash=sha256:fb4691ad3437e0e8b8d84256d36da18085ba1fc6dd47861e7a7e64c6a808390c # via # nemo-agents-example-calculator # nemo-agents-plugin # nvidia-nat-config-optimizer # nvidia-nat-langchain # nvidia-nat-opentelemetry -nvidia-nat-eval==1.7.0rc3 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ - --hash=sha256:44adcc9f8534b3a0abfde521aabc1e450ea52ed3d57381fcaf9ace7cf67b15c4 +nvidia-nat-eval==1.7.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ + --hash=sha256:724fa6410a7a66050b525d75660d7fb753ed83aba32c37d31111ae322d54a548 # via - # nemo-agents-plugin # nvidia-nat-config-optimizer # nvidia-nat-langchain -nvidia-nat-langchain==1.7.0rc3 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ - --hash=sha256:544d52d0e6d6ae8c6b9da11ee5b471ba119e1704b98ed1e3119ab18c88c0d87c 
+nvidia-nat-langchain==1.7.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ + --hash=sha256:e00812f9d2c602bb59cc006081a5deef2e71cc46357e70f51503780196f9cc20 # via nemo-agents-plugin -nvidia-nat-opentelemetry==1.7.0rc3 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ - --hash=sha256:dbe77666b0c8143291995a76a591441c243d41a3db5c1b79560d79a0885fed2c - # via - # nemo-agents-plugin - # nvidia-nat-langchain +nvidia-nat-opentelemetry==1.7.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ + --hash=sha256:e468fb4bd2a9e2fa4fb0d67daa1dea22b900c70301249d2785aa96f305808f7f + # via nvidia-nat-langchain oauthlib==3.3.1 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:0f0f8aa759826a193cf66c12ea1af1637f87b9b4622d46e866952bb022e538c9 \ --hash=sha256:88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1 @@ -2469,6 +2487,7 @@ packaging==26.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or # opentelemetry-instrumentation-sqlalchemy # optuna # pytest + # transformers pandas==2.3.3 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791 \ --hash=sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac \ @@ -2876,6 +2895,8 @@ pydantic==2.12.5 ; (platform_machine == 'arm64' and 
sys_platform == 'darwin') or # mcp # nemo-anonymizer # nemo-auditor-plugin + # nemo-automodel-plugin + # nemo-customizer-plugin # nemo-evaluator-plugin # nemo-evaluator-sdk # nemo-platform-ext @@ -2885,6 +2906,7 @@ pydantic==2.12.5 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or # nemoguardrails # nemoplatform # nmp-auth + # nmp-automodel # nmp-common # nmp-customizer # nmp-entities @@ -2979,9 +3001,11 @@ pydantic-settings==2.8.1 ; (platform_machine == 'arm64' and sys_platform == 'dar # fastapi # langchain-community # mcp + # nemo-automodel-plugin # nemo-platform-plugin # nemo-safe-synthesizer # nmp-auth + # nmp-automodel # nmp-common # nmp-customizer # nmp-entities @@ -3157,6 +3181,7 @@ pyyaml==6.0.3 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (p # nvidia-nat-core # optuna # sqlfluff + # transformers # uvicorn # watchdog ragas==0.3.5 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ @@ -3223,6 +3248,7 @@ regex==2025.7.34 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or # sacrebleu # sqlfluff # tiktoken + # transformers requests==2.33.1 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:18817f8c57c6263968bc123d237e3b8b08ac046f5456bd1e307ee8f4250d3517 \ --hash=sha256:4e6d1ef462f3626a1f0a0a9c42dd93c63bad33f9f1c1937509b8c5c8718ab56a @@ -3472,6 +3498,25 @@ sacrebleu==2.6.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or # via # nemo-evaluator-sdk # nemoplatform +safetensors==0.8.0rc0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ + 
--hash=sha256:2b40d25911c5f241cad874ad1ea4100a9a9e3c2d469a73a38b47af759d239f44 \ + --hash=sha256:2ef8ab6704ea895cb13c89d5825f49e87328cac2093e7e45fb3cb615bd457fb2 \ + --hash=sha256:35bf158d1555df7a529c844ae8ab89355c9df34546de0f94c47d538902bcc07c \ + --hash=sha256:50c56d7b6a2f44c3f4ab130bfeb6a8a51ce72bec152805f9c5a46bdf6addb6c5 \ + --hash=sha256:79547625fa84f3a9b28b933e44c67d012edf22a0c7170ed68835b9f467dda836 \ + --hash=sha256:7e7cc49c69d8df5aaaf332532cd636609727599f81294bf4e5de56a2e3b70a10 \ + --hash=sha256:94d9c0d569a124fe3074b9934031c2cdcfab12d4d7b64ae17343fac4a92081e8 \ + --hash=sha256:98b0f6f2a14a6bde7f6acaa5f0381baef9a87c6a3124338affe4e4bb40bf826b \ + --hash=sha256:a132d3cf5f63c3f02b82c4abf65c58d33a8422199ae34e09a9a7edb661bd2ca9 \ + --hash=sha256:b4168a839ff287dc26b0d843e1760962b2e92ed5645f95e8ab3f4b9401807e6a \ + --hash=sha256:b4fcccda047df747e2463744428cba352d99527c4e52545d07f8c3a8583136f1 \ + --hash=sha256:b7f8180f8c119dce85da7913904ccf4a0227adf095eb63f1732a6729c2672cb1 \ + --hash=sha256:bf0d366f75f63867f1ede90f87090450c7cec320da1fc2a597f9bb8cb73460db \ + --hash=sha256:c052d1706567487bc103088fe02daf05132dbccbbc3d798753541b66eb37fb14 \ + --hash=sha256:c1e7a6a1c0dd0128888bc47aca0a9625855673f44f275bf4073088563bf7121b \ + --hash=sha256:d42f6c44773901ce1a021d2372747a559e9ec5aa59d044c0d711c273bff21c67 \ + --hash=sha256:d6532e381c492f5a6b4e82706b232f003e9e697b77d6c2eb7e806d11b578d00b + # via transformers scikit-network==0.33.5 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:2408d3f4c81256a3193d536aad4a6ffcfbb05d096abe6a9cc0b6b5e275df876d \ --hash=sha256:2866b16aed9ef25ba42cb2f2e44ef2ad079337f336ce48d0604b55fa4af87688 \ @@ -3627,9 +3672,9 @@ sqlalchemy==2.0.48 ; (platform_machine == 'arm64' and sys_platform == 'darwin') # nmp-intake # optuna # sqlmodel -sqlfluff==3.4.0 ; (platform_machine == 'arm64' and 
sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ - --hash=sha256:115e3f1bf1dc1318c58426ba3299eb682642cb67b5d12d9ea7c42b5e23aeabd6 \ - --hash=sha256:6e1ea2d39b20cc791a1a009c234afaf043b448c7f2eb1c11551316fb41f36f47 +sqlfluff==4.1.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ + --hash=sha256:83dd4c081afb48c0af861833015a18b13d52726bfe52a286246dbd7a64b7d111 \ + --hash=sha256:ae11123ca4a697abadbd2783f85f04e58c36e7dd26ae8024f400efccc6a44631 # via data-designer-engine sqlmodel==0.0.37 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:2137a4045ef3fd66a917a7717ada959a1ceb3630d95e1f6aaab39dd2c0aef278 \ @@ -3698,6 +3743,7 @@ tenacity==9.1.4 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or # instructor # langchain-community # langchain-core + # nmp-automodel # nmp-models tiktoken==0.12.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:01d99484dc93b129cd0964f9d34eee953f2737301f18b3c7257bf368d7615baa \ @@ -3756,6 +3802,7 @@ tokenizers==0.22.2 ; (platform_machine == 'arm64' and sys_platform == 'darwin') # fastembed # langchain-huggingface # litellm + # transformers tomlkit==0.14.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:592064ed85b40fa213469f81ac584f67a4f2992509a7c3ea2d632208623a3680 \ --hash=sha256:cf00efca415dbd57575befb1f6634c4f42d2d87dbba376128adb42c121b87064 
@@ -3787,6 +3834,11 @@ tqdm==4.67.3 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (pl # optuna # ragas # sqlfluff + # transformers +transformers==5.3.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ + --hash=sha256:009555b364029da9e2946d41f1c5de9f15e6b1df46b189b7293f33a161b9c557 \ + --hash=sha256:50ac8c89c3c7033444fb3f9f53138096b997ebb70d4b5e50a2e810bf12d3d29a + # via nemo-customizer-plugin typer==0.24.1 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:112c1f0ce578bfb4cab9ffdabc68f031416ebcc216536611ba21f04e9aa84c9e \ --hash=sha256:e39b4732d65fbdcde189ae76cf7cd48aeae72919dea1fdfc16593be016256b45 @@ -3797,12 +3849,15 @@ typer==0.24.1 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (p # huggingface-hub # instructor # nemo-auditor-plugin + # nemo-automodel-plugin + # nemo-customizer-plugin # nemo-evaluator-plugin # nemo-platform-ext # nemo-platform-plugin # nemo-platform-sdk # nemoguardrails # ragas + # transformers types-aioboto3==15.5.0 ; (platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') \ --hash=sha256:5769a1c3df7ca1abedf3656ddf0b970c9b0436d0f88cf4686040b55cd2a02925 \ --hash=sha256:8aed7c9b6fe9b59e6ce74f7a6db7b8a9912a34c8f80ed639fac1fa59d6b20aa1 diff --git a/uv.lock b/uv.lock index f0839fae..2323fc42 100644 --- a/uv.lock +++ b/uv.lock @@ -41,6 +41,8 @@ members = [ "nemo-agents-plugin", "nemo-anonymizer-plugin", "nemo-auditor-plugin", + "nemo-automodel-plugin", + "nemo-customizer-plugin", "nemo-data-designer-plugin", "nemo-evaluator-plugin", "nemo-evaluator-sdk", @@ -54,6 +56,7 @@ members = [ "nemo-switchyard", 
"nemoplatform", "nmp-auth", + "nmp-automodel", "nmp-build-tools", "nmp-common", "nmp-core-mcp", @@ -4780,6 +4783,88 @@ dev = [ { name = "ruff", specifier = ">=0.11.8" }, ] +[[package]] +name = "nemo-automodel-plugin" +version = "0.1.0" +source = { editable = "plugins/nemo-automodel" } +dependencies = [ + { name = "nemo-platform", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' 
and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-platform-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nmp-automodel", marker = "(platform_machine == 'arm64' and sys_platform == 
'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "pydantic", extra = ["email"], marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 
'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "pydantic-settings", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and 
extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "typer", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or 
(sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, +] + +[package.dev-dependencies] +dev = [ + { name = "fastapi", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or 
(sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "httpx", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 
'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-customizer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' 
and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "pytest", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 
'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "pytest-asyncio", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 
'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "ruff", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" 
}, +] + +[package.metadata] +requires-dist = [ + { name = "nemo-platform", editable = "packages/nemo_platform" }, + { name = "nemo-platform-plugin", editable = "packages/nemo_platform_plugin" }, + { name = "nmp-automodel", editable = "services/automodel" }, + { name = "pydantic", specifier = ">=2.10.6" }, + { name = "pydantic-settings", specifier = ">=2.6.1" }, + { name = "typer", specifier = ">=0.12.5" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "fastapi", specifier = ">=0.115.0" }, + { name = "httpx", specifier = ">=0.27.0" }, + { name = "nemo-customizer-plugin", editable = "plugins/nemo-customizer" }, + { name = "pytest", specifier = ">=8.3.4" }, + { name = "pytest-asyncio", specifier = ">=0.25.3" }, + { name = "ruff", specifier = ">=0.11.8" }, +] + +[[package]] +name = "nemo-customizer-plugin" +version = "0.1.0" +source = { editable = "plugins/nemo-customizer" } +dependencies = [ + { name = "datasets", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and 
extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-platform", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 
'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-platform-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 
'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "pydantic", extra = ["email"], marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 
'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "transformers", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "typer", marker = 
"(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, +] + +[package.dev-dependencies] +dev = [ + { name = "fastapi", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and 
sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "pytest", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and 
sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "pytest-asyncio", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 
'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "ruff", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 
'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, +] + +[package.metadata] +requires-dist = [ + { name = "datasets", specifier = ">=3.3.1" }, + { name = "nemo-platform", editable = "packages/nemo_platform" }, + { name = "nemo-platform-plugin", editable = "packages/nemo_platform_plugin" }, + { name = "pydantic", specifier = ">=2.10.6" }, + { name = "transformers", specifier = ">=4.48.0" }, + { name = "typer", specifier = ">=0.12.5" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "fastapi", specifier = ">=0.115.0" }, + { name = "pytest", specifier = ">=8.3.4" }, + { name = "pytest-asyncio", specifier = ">=0.25.3" }, + { name = "ruff", specifier = ">=0.11.8" }, +] + [[package]] name = "nemo-data-designer-plugin" version = "0.1.0" @@ -6537,6 +6622,8 @@ core-services = [ { name = "nemo-agents-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 
'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, { name = "nemo-anonymizer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 
'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, { name = "nemo-auditor-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 
'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-automodel-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 
'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-customizer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra 
== 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, { name = "nemo-data-designer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 
'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, { name = "nemo-evaluator-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' 
and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, { name = "nemo-guardrails-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 
'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, @@ -6635,6 +6722,8 @@ enabled-plugins = [ { name = "nemo-agents-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 
'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, { name = "nemo-anonymizer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, { name = 
"nemo-auditor-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-automodel-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and 
sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-customizer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 
'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, { name = "nemo-data-designer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and 
extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, { name = "nemo-evaluator-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' 
and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, { name = "nemo-guardrails-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' 
and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, @@ -6644,6 +6733,8 @@ functional-services = [ { name = "nemo-agents-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 
'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, { name = "nemo-anonymizer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 
'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, { name = "nemo-auditor-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 
'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-automodel-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 
'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-customizer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, { name = 
"nemo-data-designer-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, { name = "nemo-evaluator-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' 
and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, { name = "nemo-guardrails-plugin", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 
'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, @@ -6739,6 +6830,8 @@ core-services = [ { name = "nemo-agents-plugin", editable = "plugins/nemo-agents" }, { name = "nemo-anonymizer-plugin", editable = "plugins/nemo-anonymizer" }, { name = "nemo-auditor-plugin", editable = "plugins/nemo-auditor" }, + { name = "nemo-automodel-plugin", editable = "plugins/nemo-automodel" }, + { name = "nemo-customizer-plugin", editable = "plugins/nemo-customizer" }, { name = "nemo-data-designer-plugin", editable = "plugins/nemo-data-designer" }, { name = "nemo-evaluator-plugin", editable = "plugins/nemo-evaluator" }, { name = 
"nemo-guardrails-plugin", editable = "plugins/nemo-guardrails" }, @@ -6840,6 +6933,8 @@ enabled-plugins = [ { name = "nemo-agents-plugin", editable = "plugins/nemo-agents" }, { name = "nemo-anonymizer-plugin", editable = "plugins/nemo-anonymizer" }, { name = "nemo-auditor-plugin", editable = "plugins/nemo-auditor" }, + { name = "nemo-automodel-plugin", editable = "plugins/nemo-automodel" }, + { name = "nemo-customizer-plugin", editable = "plugins/nemo-customizer" }, { name = "nemo-data-designer-plugin", editable = "plugins/nemo-data-designer" }, { name = "nemo-evaluator-plugin", editable = "plugins/nemo-evaluator" }, { name = "nemo-guardrails-plugin", editable = "plugins/nemo-guardrails" }, @@ -6849,6 +6944,8 @@ functional-services = [ { name = "nemo-agents-plugin", editable = "plugins/nemo-agents" }, { name = "nemo-anonymizer-plugin", editable = "plugins/nemo-anonymizer" }, { name = "nemo-auditor-plugin", editable = "plugins/nemo-auditor" }, + { name = "nemo-automodel-plugin", editable = "plugins/nemo-automodel" }, + { name = "nemo-customizer-plugin", editable = "plugins/nemo-customizer" }, { name = "nemo-data-designer-plugin", editable = "plugins/nemo-data-designer" }, { name = "nemo-evaluator-plugin", editable = "plugins/nemo-evaluator" }, { name = "nemo-guardrails-plugin", editable = "plugins/nemo-guardrails" }, @@ -7011,6 +7108,44 @@ dev = [ { name = "typer", specifier = ">=0.9.0" }, ] +[[package]] +name = "nmp-automodel" +version = "0.1.0" +source = { editable = "services/automodel" } +dependencies = [ + { name = "aiofiles", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and 
extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "httpx", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or 
(sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "jsonschema", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 
'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nemo-platform-sdk", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or 
(sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "nmp-common", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and 
extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "pydantic", extra = ["email"], marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 
'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "pydantic-settings", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 
'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "tenacity", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, +] + 
+[package.optional-dependencies] +dev = [ + { name = "pytest", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "pytest-asyncio", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or 
(platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "pytest-mock", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or 
(sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, +] + +[package.metadata] +requires-dist = [ + { name = "aiofiles", specifier = ">=24.1.0" }, + { name = "httpx", specifier = ">=0.27.0" }, + { name = "jsonschema", specifier = ">=4.23.0" }, + { name = "nemo-platform-sdk", editable = "sdk/python/nemo-platform" }, + { name = "nmp-common", editable = "packages/nmp_common" }, + { name = "pydantic", specifier = ">=2.10.6" }, + { name = "pydantic-settings", specifier = ">=2.6.1" }, + { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.3.4" }, + { name = "pytest-asyncio", marker = "extra == 'dev'", specifier 
= ">=0.25.3" }, + { name = "pytest-mock", marker = "extra == 'dev'", specifier = ">=3.14.0" }, + { name = "tenacity", specifier = ">=8.5.0" }, +] +provides-extras = ["dev"] + [[package]] name = "nmp-build-tools" version = "0.0.0" @@ -10809,6 +10944,30 @@ dev = [ { name = "pytest", specifier = ">=8.3.3" }, ] +[[package]] +name = "safetensors" +version = "0.8.0rc0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/c9/76/70a044292cabc4b591b9b7947aa7d5dd346647acab18532e7e971a02141e/safetensors-0.8.0rc0.tar.gz", hash = "sha256:b4168a839ff287dc26b0d843e1760962b2e92ed5645f95e8ab3f4b9401807e6a", size = 235447, upload-time = "2026-04-14T14:30:42.125Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/c4/8ae3b9b8159babed52fe67698e4095858787dafb3363fa3500c150eef5d5/safetensors-0.8.0rc0-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c1e7a6a1c0dd0128888bc47aca0a9625855673f44f275bf4073088563bf7121b", size = 469331, upload-time = "2026-04-14T14:30:35.024Z" }, + { url = "https://files.pythonhosted.org/packages/7d/28/5322eb9057aeccb8492546a8e7fc070a8490afcca6e658f0a53e2279cca8/safetensors-0.8.0rc0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:c052d1706567487bc103088fe02daf05132dbccbbc3d798753541b66eb37fb14", size = 450714, upload-time = "2026-04-14T14:30:33.884Z" }, + { url = "https://files.pythonhosted.org/packages/85/10/8aedf0becbe6ba019f0be2ab1efbf124d1319d7daaea5f1e3c165670a162/safetensors-0.8.0rc0-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79547625fa84f3a9b28b933e44c67d012edf22a0c7170ed68835b9f467dda836", size = 493726, upload-time = "2026-04-14T14:30:23.641Z" }, + { url = "https://files.pythonhosted.org/packages/b8/de/9a6d5d2b842814ff7a715169054235b6141924350be746b02f7906dd0756/safetensors-0.8.0rc0-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a132d3cf5f63c3f02b82c4abf65c58d33a8422199ae34e09a9a7edb661bd2ca9", size = 502966, 
upload-time = "2026-04-14T14:30:25.344Z" }, + { url = "https://files.pythonhosted.org/packages/ee/aa/29be34707d27b81b280759f4e52fb38fc6955e2d5e053164b9ab9eabee77/safetensors-0.8.0rc0-cp310-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d42f6c44773901ce1a021d2372747a559e9ec5aa59d044c0d711c273bff21c67", size = 621250, upload-time = "2026-04-14T14:30:26.746Z" }, + { url = "https://files.pythonhosted.org/packages/7d/fa/5b0997ca9cc70c4e6e6ed2afb59506c7065df29bc4771df8f7be61c3bc90/safetensors-0.8.0rc0-cp310-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b40d25911c5f241cad874ad1ea4100a9a9e3c2d469a73a38b47af759d239f44", size = 527309, upload-time = "2026-04-14T14:30:29.722Z" }, + { url = "https://files.pythonhosted.org/packages/25/e0/be46e568cc05530f106ab5dc2faa383ba51533022d735df32db5d550d598/safetensors-0.8.0rc0-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf0d366f75f63867f1ede90f87090450c7cec320da1fc2a597f9bb8cb73460db", size = 509088, upload-time = "2026-04-14T14:30:32.377Z" }, + { url = "https://files.pythonhosted.org/packages/88/5c/497168a26d656fbf39e20470ad8be60d3bb766267792d999061a6e164bb6/safetensors-0.8.0rc0-cp310-abi3-manylinux_2_31_riscv64.whl", hash = "sha256:50c56d7b6a2f44c3f4ab130bfeb6a8a51ce72bec152805f9c5a46bdf6addb6c5", size = 509345, upload-time = "2026-04-14T14:30:28.235Z" }, + { url = "https://files.pythonhosted.org/packages/01/a4/54fbeed1447bba46bf8715cbf0d45c11339deeb66afde9ced01ead9233c9/safetensors-0.8.0rc0-cp310-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:94d9c0d569a124fe3074b9934031c2cdcfab12d4d7b64ae17343fac4a92081e8", size = 543961, upload-time = "2026-04-14T14:30:31.135Z" }, + { url = "https://files.pythonhosted.org/packages/4f/18/af173ce378d316352a5a20fe4b161cf54366519db587cc12b1aa9771be17/safetensors-0.8.0rc0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b4fcccda047df747e2463744428cba352d99527c4e52545d07f8c3a8583136f1", size = 668965, 
upload-time = "2026-04-14T14:30:36.24Z" }, + { url = "https://files.pythonhosted.org/packages/47/bf/de0c22d52d4006f682dec432d237bce71418c236f12accff6e9d614ec66d/safetensors-0.8.0rc0-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:2ef8ab6704ea895cb13c89d5825f49e87328cac2093e7e45fb3cb615bd457fb2", size = 778061, upload-time = "2026-04-14T14:30:37.522Z" }, + { url = "https://files.pythonhosted.org/packages/6f/f9/bd146043d920cd3fa0b62fd2f548f7b73f0a6212ed960546055bbb11d62a/safetensors-0.8.0rc0-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:35bf158d1555df7a529c844ae8ab89355c9df34546de0f94c47d538902bcc07c", size = 751302, upload-time = "2026-04-14T14:30:39.191Z" }, + { url = "https://files.pythonhosted.org/packages/44/58/448c080cd6c2b46662dd0fe93e3814e9ea7e1f818ddf8c0d13ca75eda47a/safetensors-0.8.0rc0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:98b0f6f2a14a6bde7f6acaa5f0381baef9a87c6a3124338affe4e4bb40bf826b", size = 713576, upload-time = "2026-04-14T14:30:40.49Z" }, + { url = "https://files.pythonhosted.org/packages/55/97/68207a641c30edc7eed692d89cf340e1fe8ba03f91c3643c9a02419d0942/safetensors-0.8.0rc0-cp310-abi3-win32.whl", hash = "sha256:7e7cc49c69d8df5aaaf332532cd636609727599f81294bf4e5de56a2e3b70a10", size = 325782, upload-time = "2026-04-14T14:30:45.907Z" }, + { url = "https://files.pythonhosted.org/packages/b3/0b/c28fd694c98ebfefb764538a2906428aacb51b3bf18e2206723b1ccc6d48/safetensors-0.8.0rc0-cp310-abi3-win_amd64.whl", hash = "sha256:d6532e381c492f5a6b4e82706b232f003e9e697b77d6c2eb7e806d11b578d00b", size = 342453, upload-time = "2026-04-14T14:30:44.668Z" }, + { url = "https://files.pythonhosted.org/packages/51/73/fd944d3417ba04bd0e72682fa1bedc6d99d986a3594fc7910313088cfe88/safetensors-0.8.0rc0-cp310-abi3-win_arm64.whl", hash = "sha256:b7f8180f8c119dce85da7913904ccf4a0227adf095eb63f1732a6729c2672cb1", size = 330970, upload-time = "2026-04-14T14:30:43.451Z" }, +] + [[package]] name = "scikit-learn" version = "1.8.0" @@ -11686,6 +11845,26 @@ wheels 
= [ { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, ] +[[package]] +name = "transformers" +version = "5.3.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and 
extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "numpy", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 
'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "packaging", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "pyyaml", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or 
(platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "regex", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or 
(sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "safetensors", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' 
and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "tokenizers", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and 
extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "tqdm", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 
'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, + { name = "typer", marker = "(platform_machine == 'arm64' and sys_platform == 'darwin') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'darwin' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or 
(sys_platform == 'darwin' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-12-nemoplatform-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128') or (sys_platform == 'linux' and extra == 'extra-12-nemoplatform-cu128' and extra == 'extra-20-nmp-safe-synthesizer-cpu') or (sys_platform == 'linux' and extra == 'extra-20-nmp-safe-synthesizer-cpu' and extra == 'extra-20-nmp-safe-synthesizer-cu128')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/fc/1a/70e830d53ecc96ce69cfa8de38f163712d2b43ac52fbd743f39f56025c31/transformers-5.3.0.tar.gz", hash = "sha256:009555b364029da9e2946d41f1c5de9f15e6b1df46b189b7293f33a161b9c557", size = 8830831, upload-time = "2026-03-04T17:41:46.119Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b8/88/ae8320064e32679a5429a2c9ebbc05c2bf32cefb6e076f9b07f6d685a9b4/transformers-5.3.0-py3-none-any.whl", hash = "sha256:50ac8c89c3c7033444fb3f9f53138096b997ebb70d4b5e50a2e810bf12d3d29a", size = 10661827, upload-time = "2026-03-04T17:41:42.722Z" }, +] + [[package]] name = "trove-classifiers" version = "2026.1.14.14"