1 change: 1 addition & 0 deletions .gitignore
@@ -183,3 +183,4 @@ _version.py

# NeMo Run
.nemo_run/
local/
48 changes: 48 additions & 0 deletions docs/guides/execution.md
@@ -53,6 +53,7 @@ The packager support matrix is described below:
| SkypilotExecutor | run.Packager, run.GitArchivePackager, run.PatternPackager, run.HybridPackager |
| DGXCloudExecutor | run.Packager, run.GitArchivePackager, run.PatternPackager, run.HybridPackager |
| LeptonExecutor | run.Packager, run.GitArchivePackager, run.PatternPackager, run.HybridPackager |
| KubeflowExecutor | run.Packager |

`run.Packager` is a passthrough base packager.

@@ -293,6 +294,53 @@ def your_dgx_executor(nodes: int, gpus_per_node: int, container_image: str):

For a complete end-to-end example using DGX Cloud with NeMo, refer to the [NVIDIA DGX Cloud NeMo End-to-End Workflow Example](https://docs.nvidia.com/dgx-cloud/run-ai/latest/nemo-e2e-example.html).

#### KubeflowExecutor

The `KubeflowExecutor` integrates with the [Kubeflow Training Operator](https://github.com/kubeflow/training-operator) to run distributed training jobs on any Kubernetes cluster. It submits the custom resources directly through the Kubernetes API, so no `kubectl` binary is required.

Two job kinds are supported via the `job_kind` parameter:

- **`"PyTorchJob"`** (default) — Training Operator v1 (`kubeflow.org/v1`)
- **`"TrainJob"`** — Training Operator v2 (`trainer.kubeflow.org/v1alpha1`)
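
The two kinds map to different CRD schemas. As a rough illustration, a minimal v1 `PyTorchJob` manifest has the shape sketched below; the field names follow the upstream `kubeflow.org/v1` schema, but the helper itself is hypothetical and the manifest the executor actually builds may differ.

```python
# Hypothetical sketch of a kubeflow.org/v1 PyTorchJob manifest, with one
# Master replica and (num_nodes - 1) Worker replicas, mirroring the pod
# layout described in this guide. Not the executor's real builder.
def pytorch_job_manifest(name, namespace, image, num_nodes, gpus_per_node):
    def replica_spec(count):
        return {
            "replicas": count,
            "template": {
                "spec": {
                    "containers": [{
                        "name": "pytorch",
                        "image": image,
                        "resources": {"limits": {"nvidia.com/gpu": gpus_per_node}},
                    }]
                }
            },
        }

    spec = {"pytorchReplicaSpecs": {"Master": replica_spec(1)}}
    if num_nodes > 1:
        spec["pytorchReplicaSpecs"]["Worker"] = replica_spec(num_nodes - 1)
    return {
        "apiVersion": "kubeflow.org/v1",
        "kind": "PyTorchJob",
        "metadata": {"name": name, "namespace": namespace},
        "spec": spec,
    }
```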

Kubernetes configuration is loaded automatically: local kubeconfig is tried first, falling back to in-cluster config when running inside a pod.
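
The fallback order described above can be sketched as a generic try-in-order loader. This is an illustrative pattern, not the executor's actual code; presumably the real implementation calls the official Python client's `load_kube_config()` and then `load_incluster_config()`.

```python
# Hypothetical sketch of the kubeconfig fallback: try each (name, loader)
# pair in order and return the name of the first source that succeeds.
def load_config(loaders):
    errors = []
    for name, loader in loaders:
        try:
            loader()  # each loader raises if its config source is absent
            return name
        except Exception as exc:
            errors.append(f"{name}: {exc}")
    raise RuntimeError("no Kubernetes config found: " + "; ".join(errors))
```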

Here's an example configuration:

```python
# PyTorchJob (default)
executor = run.KubeflowExecutor(
namespace="runai-nemo-ci",
image="nvcr.io/nvidia/nemo:26.02",
num_nodes=3, # total pods: 1 Master + (num_nodes-1) Workers
gpus_per_node=8, # also sets nproc_per_node unless overridden explicitly
cpu_requests="16",
memory_requests="64Gi",
volumes=[
{"name": "model-cache", "persistentVolumeClaim": {"claimName": "data-pvc"}}
],
volume_mounts=[{"name": "model-cache", "mountPath": "/nemo-workspace"}],
labels={"app": "nemo-ci-training"},
env_vars={"NCCL_DEBUG": "INFO"},
)

# TrainJob (Training Operator v2)
executor = run.KubeflowExecutor(
job_kind="TrainJob",
runtime_ref="torch-distributed", # name of the ClusterTrainingRuntime
namespace="runai-nemo-ci",
image="nvcr.io/nvidia/nemo:26.02",
num_nodes=3,
gpus_per_node=8,
)
```

Calling `cancel(wait=True)` polls until both the custom resource and all associated pods are fully terminated before returning.
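
The wait behavior amounts to a poll-until-gone loop. The sketch below is a generic version of that pattern under stated assumptions; the real executor checks the custom resource and its pods through the Kubernetes API, and `wait_until_gone` is an illustrative name, not part of the NeMo Run API.

```python
import time

# Hypothetical sketch: poll exists() until it reports the resource is gone
# or the timeout elapses. Returns True on clean termination, False on timeout.
def wait_until_gone(exists, timeout=300.0, interval=5.0, sleep=time.sleep):
    waited = 0.0
    while exists():
        if waited >= timeout:
            return False
        sleep(interval)
        waited += interval
    return True
```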

##### Limitations

Pod spec attributes such as `resourceClaims` are not [supported](https://github.com/kubeflow/trainer/issues/3264) and must be injected by other means, such as a mutating admission webhook.

#### LeptonExecutor

The `LeptonExecutor` integrates with an NVIDIA DGX Cloud Lepton cluster's Python SDK to launch distributed jobs. It uses the Lepton SDK's API calls to authenticate, identify the target node group and resource shapes, and submit the job specification, which is then launched as a batch job on the cluster.
2 changes: 2 additions & 0 deletions nemo_run/__init__.py
@@ -24,6 +24,7 @@
from nemo_run.core.execution.base import Executor, ExecutorMacros, import_executor
from nemo_run.core.execution.dgxcloud import DGXCloudExecutor
from nemo_run.core.execution.docker import DockerExecutor
from nemo_run.core.execution.kubeflow import KubeflowExecutor
from nemo_run.core.execution.launcher import FaultTolerance, SlurmRay, SlurmTemplate, Torchrun
from nemo_run.core.execution.lepton import LeptonExecutor
from nemo_run.core.execution.local import LocalExecutor
@@ -66,6 +67,7 @@
"Packager",
"Partial",
"Plugin",
"KubeflowExecutor",
"run",
"Script",
"SkypilotExecutor",
2 changes: 2 additions & 0 deletions nemo_run/config.py
@@ -495,6 +495,8 @@ def to_command(
)
with open(filename, "w") as f:
f.write("#!/usr/bin/bash\n" + inline_content)
# ensure the generated script is executable (adds rwxr-xr-x bits)
os.chmod(filename, os.stat(filename).st_mode | 0o755)

if is_local:
cmd = [filename]
2 changes: 2 additions & 0 deletions nemo_run/core/execution/__init__.py
@@ -16,6 +16,7 @@
from nemo_run.core.execution.dgxcloud import DGXCloudExecutor
from nemo_run.core.execution.lepton import LeptonExecutor
from nemo_run.core.execution.local import LocalExecutor
from nemo_run.core.execution.kubeflow import KubeflowExecutor
from nemo_run.core.execution.skypilot import SkypilotExecutor
from nemo_run.core.execution.slurm import SlurmExecutor

@@ -25,4 +26,5 @@
"SkypilotExecutor",
"DGXCloudExecutor",
"LeptonExecutor",
"KubeflowExecutor",
]