From bc22b0557ccf34efce498f13278dcbd58a603853 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 24 Feb 2026 15:16:04 -0800 Subject: [PATCH 01/23] Add the new CUDA package to the search for version Signed-off-by: Przemek Tredak --- build_tools/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/build_tools/utils.py b/build_tools/utils.py index 885901068a..2c5d441866 100644 --- a/build_tools/utils.py +++ b/build_tools/utils.py @@ -292,6 +292,13 @@ def cuda_version() -> Tuple[int, ...]: version_str = get_version("nvidia-cuda-runtime-cu12") version_tuple = tuple(int(part) for part in version_str.split(".") if part.isdigit()) return version_tuple + except: + pass + + try: + version_str = get_version("nvidia-cuda-runtime") + version_tuple = tuple(int(part) for part in version_str.split(".") if part.isdigit()) + return version_tuple except importlib.metadata.PackageNotFoundError: raise RuntimeError("Could neither find NVCC executable nor CUDA runtime Python package.") From a758953ec6b15a164558ebb0a6531addd9ffa9f2 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 2 Mar 2026 10:53:46 -0800 Subject: [PATCH 02/23] Move nccl.h behind the cublasmp guard Signed-off-by: Przemek Tredak --- transformer_engine/common/util/logging.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/transformer_engine/common/util/logging.h b/transformer_engine/common/util/logging.h index 8031e342e2..88fd854c6d 100644 --- a/transformer_engine/common/util/logging.h +++ b/transformer_engine/common/util/logging.h @@ -12,10 +12,9 @@ #include #include -#include "nccl.h" - #ifdef NVTE_WITH_CUBLASMP #include +#include "nccl.h" #endif // NVTE_WITH_CUBLASMP #include From 4c86fb0298e7d87094d7bd028510887a588e483e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 2 Mar 2026 11:50:05 -0800 Subject: [PATCH 03/23] Try pytorch build Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 44 +++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d80564274e..10b92d3fbd 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -39,10 +39,6 @@ jobs: name: 'PyTorch' runs-on: ubuntu-latest steps: - - name: Move /var/lib/docker/ - shell: bash -euxo pipefail {0} - run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker" - - name: Maximize build space uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794 with: @@ -53,37 +49,43 @@ jobs: remove-android: 'true' remove-haskell: 'true' remove-codeql: 'true' - build-mount-path: '/var/lib/docker/' - - - name: Restore /var/lib/docker/ - shell: bash -euxo pipefail {0} - run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker" - name: 'Checkout' uses: actions/checkout@v3 with: submodules: recursive + - name: "Disk space check before dependencies" + run: df -lh - - name: Start named container - run: | - docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04 sleep infinity - + - name: ccache + uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad - name: 'Dependencies' run: | - docker exec builder bash -c '\ - apt-get update && \ - apt-get install -y git python3.9 pip cudnn9-cuda-12 && \ - pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript && \ - apt-get clean \ - ' + pip install cmake torch ninja pybind11 numpy + pip install torch --index-url https://download.pytorch.org/whl/cu130 + pip install "nvidia-cuda-nvcc<13.1" "nvidia-cuda-cccl<13.1" "nvidia-cuda-crt<13.1" "nvidia-nvvm<13.1" "nvidia-cuda-profiler-api<13.1" "nvidia-nvml-dev<13.1" + ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_HOME/lib/libcudart.so + ln -s $CUDA_PATH/lib/libcublas.so.13 $CUDA_HOME/lib/libcublas.so + ln -s $CUDA_PATH/lib $CUDA_HOME/lib64 + env: + CUDA_PATH: /usr/local/lib/python3.12/dist-packages/nvidia/cu13 + + - name: "Disk space check after dependencies" + run: df -lh - name: 'Build' - run: docker exec builder bash -c 'pip install --no-build-isolation . -v --no-deps' + run: | + export PATH=$CUDA_PATH/bin:$PATH + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH/lib:$LD_LIBRARY_PATH + export LIBRARY_PATH=$LIBRARY_PATH/lib:$LIBRARY_PATH + NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v --no-deps env: + CUDA_PATH: /usr/local/lib/python3.12/dist-packages/nvidia/cu13 NVTE_FRAMEWORK: pytorch MAX_JOBS: 1 + SCCACHE_GHA_ENABLED: "true" - name: 'Sanity check' - run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py' + run: python3 tests/pytorch/test_sanity_import.py jax: name: 'JAX' runs-on: ubuntu-latest From b56daa2b0ea2675a75f5b093f4eea0263a460ca2 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 2 Mar 2026 11:53:30 -0800 Subject: [PATCH 04/23] Fix Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 10b92d3fbd..f717516a7c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -39,17 +39,17 @@ jobs: name: 'PyTorch' runs-on: ubuntu-latest steps: - - name: Maximize build space - uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794 - with: - root-reserve-mb: 5120 - temp-reserve-mb: 32 - swap-size-mb: 10240 - remove-dotnet: 'true' - remove-android: 'true' - remove-haskell: 'true' - remove-codeql: 'true' - + # - name: Maximize build space + # uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794 + # with: + # root-reserve-mb: 5120 + # temp-reserve-mb: 32 + # swap-size-mb: 10240 + # remove-dotnet: 'true' + # remove-android: 'true' + # remove-haskell: 'true' + # remove-codeql: 'true' + # - name: 'Checkout' uses: actions/checkout@v3 with: From a59b6abc944739a6472b840fd8e370976e97a37e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 2 Mar 2026 11:55:22 -0800 Subject: [PATCH 05/23] Fix Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f717516a7c..593f779f4f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -61,7 +61,7 @@ jobs: uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad - name: 'Dependencies' run: | - pip install cmake torch ninja pybind11 numpy + pip install cmake ninja pybind11 numpy pip install torch --index-url https://download.pytorch.org/whl/cu130 pip install "nvidia-cuda-nvcc<13.1" "nvidia-cuda-cccl<13.1" "nvidia-cuda-crt<13.1" "nvidia-nvvm<13.1" "nvidia-cuda-profiler-api<13.1" "nvidia-nvml-dev<13.1" ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_HOME/lib/libcudart.so From 32fafa4514462c26a42dd817fe1a3d4eb429719e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Mar 2026 20:00:58 +0000 Subject: [PATCH 06/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/common/util/logging.h | 1 + 1 file changed, 1 insertion(+) diff --git a/transformer_engine/common/util/logging.h b/transformer_engine/common/util/logging.h index 88fd854c6d..d85046dcbd 100644 --- a/transformer_engine/common/util/logging.h +++ b/transformer_engine/common/util/logging.h @@ -14,6 +14,7 @@ #ifdef NVTE_WITH_CUBLASMP #include + #include "nccl.h" #endif // NVTE_WITH_CUBLASMP From cd7bf8ab0e731772750266fb4f86eef0a2f775fd Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 2 Mar 2026 12:02:12 -0800 Subject: [PATCH 07/23] Maybe fix? Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 593f779f4f..c70297eae6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -64,6 +64,7 @@ jobs: pip install cmake ninja pybind11 numpy pip install torch --index-url https://download.pytorch.org/whl/cu130 pip install "nvidia-cuda-nvcc<13.1" "nvidia-cuda-cccl<13.1" "nvidia-cuda-crt<13.1" "nvidia-nvvm<13.1" "nvidia-cuda-profiler-api<13.1" "nvidia-nvml-dev<13.1" + export CUDA_PATH=/usr/local/lib/python3.12/dist-packages/nvidia/cu13 ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_HOME/lib/libcudart.so ln -s $CUDA_PATH/lib/libcublas.so.13 $CUDA_HOME/lib/libcublas.so ln -s $CUDA_PATH/lib $CUDA_HOME/lib64 @@ -75,6 +76,7 @@ jobs: - name: 'Build' run: | + export CUDA_PATH=/usr/local/lib/python3.12/dist-packages/nvidia/cu13 export PATH=$CUDA_PATH/bin:$PATH export LD_LIBRARY_PATH=$LD_LIBRARY_PATH/lib:$LD_LIBRARY_PATH export LIBRARY_PATH=$LIBRARY_PATH/lib:$LIBRARY_PATH From 7e636cc40a7a4674abecbd6c48d33d815f06169c Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 2 Mar 2026 13:00:05 -0800 Subject: [PATCH 08/23] Fix Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c70297eae6..dcc1d8fdb3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -65,9 +65,9 @@ jobs: pip install torch --index-url https://download.pytorch.org/whl/cu130 pip install "nvidia-cuda-nvcc<13.1" "nvidia-cuda-cccl<13.1" "nvidia-cuda-crt<13.1" "nvidia-nvvm<13.1" "nvidia-cuda-profiler-api<13.1" "nvidia-nvml-dev<13.1" export CUDA_PATH=/usr/local/lib/python3.12/dist-packages/nvidia/cu13 - ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_HOME/lib/libcudart.so - ln -s $CUDA_PATH/lib/libcublas.so.13 $CUDA_HOME/lib/libcublas.so - ln -s $CUDA_PATH/lib $CUDA_HOME/lib64 + ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_PATH/lib/libcudart.so + ln -s $CUDA_PATH/lib/libcublas.so.13 $CUDA_PATH/lib/libcublas.so + ln -s $CUDA_PATH/lib $CUDA_PATH/lib64 env: CUDA_PATH: /usr/local/lib/python3.12/dist-packages/nvidia/cu13 From f033c8b07d72025185c0906f09a8c343cbb0a8f9 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 2 Mar 2026 14:03:18 -0800 Subject: [PATCH 09/23] Test Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index dcc1d8fdb3..4045bf9dbe 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -64,7 +64,8 @@ jobs: pip install cmake ninja pybind11 numpy pip install torch --index-url https://download.pytorch.org/whl/cu130 pip install "nvidia-cuda-nvcc<13.1" "nvidia-cuda-cccl<13.1" "nvidia-cuda-crt<13.1" "nvidia-nvvm<13.1" "nvidia-cuda-profiler-api<13.1" "nvidia-nvml-dev<13.1" - export CUDA_PATH=/usr/local/lib/python3.12/dist-packages/nvidia/cu13 + export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") + echo $CUDA_PATH ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_PATH/lib/libcudart.so ln -s $CUDA_PATH/lib/libcublas.so.13 $CUDA_PATH/lib/libcublas.so ln -s $CUDA_PATH/lib $CUDA_PATH/lib64 From df606be10be6d2f816672a9aa97d5b023edf717f Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 2 Mar 2026 14:05:48 -0800 Subject: [PATCH 10/23] Fix Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4045bf9dbe..5da531f557 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -77,7 +77,7 @@ jobs: - name: 'Build' run: | - export CUDA_PATH=/usr/local/lib/python3.12/dist-packages/nvidia/cu13 + export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") export PATH=$CUDA_PATH/bin:$PATH export LD_LIBRARY_PATH=$LD_LIBRARY_PATH/lib:$LD_LIBRARY_PATH export LIBRARY_PATH=$LIBRARY_PATH/lib:$LIBRARY_PATH From edbc26585a7fdc71f4cf5582763891d9ee714adf Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 2 Mar 2026 14:17:31 -0800 Subject: [PATCH 11/23] Fix for cuDNN Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5da531f557..826aee97bf 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -69,8 +69,6 @@ jobs: ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_PATH/lib/libcudart.so ln -s $CUDA_PATH/lib/libcublas.so.13 $CUDA_PATH/lib/libcublas.so ln -s $CUDA_PATH/lib $CUDA_PATH/lib64 - env: - CUDA_PATH: /usr/local/lib/python3.12/dist-packages/nvidia/cu13 - name: "Disk space check after dependencies" run: df -lh @@ -78,12 +76,12 @@ jobs: - name: 'Build' run: | export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") + export CUDNN_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cudnn')") export PATH=$CUDA_PATH/bin:$PATH export LD_LIBRARY_PATH=$LD_LIBRARY_PATH/lib:$LD_LIBRARY_PATH export LIBRARY_PATH=$LIBRARY_PATH/lib:$LIBRARY_PATH NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v --no-deps env: - CUDA_PATH: /usr/local/lib/python3.12/dist-packages/nvidia/cu13 NVTE_FRAMEWORK: pytorch MAX_JOBS: 1 SCCACHE_GHA_ENABLED: "true" From 2e66d4f82126022fc09295cbd85e10430bb2884b Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 2 Mar 2026 15:29:23 -0800 Subject: [PATCH 12/23] Trying without sccache Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 826aee97bf..f6a1f0e0e2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -57,8 +57,6 @@ jobs: - name: "Disk space check before dependencies" run: df -lh - - name: ccache - uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad - name: 'Dependencies' run: | pip install cmake ninja pybind11 numpy From b373a86488be3672e511ee600759f26008b9413e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 2 Mar 2026 16:04:00 -0800 Subject: [PATCH 13/23] Fix Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f6a1f0e0e2..0887891b9c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -78,7 +78,7 @@ jobs: export PATH=$CUDA_PATH/bin:$PATH export LD_LIBRARY_PATH=$LD_LIBRARY_PATH/lib:$LD_LIBRARY_PATH export LIBRARY_PATH=$LIBRARY_PATH/lib:$LIBRARY_PATH - NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v --no-deps + pip install --no-build-isolation . -v --no-deps env: NVTE_FRAMEWORK: pytorch MAX_JOBS: 1 From 8a48eb776a2a92843a689e1fa5c723e761d5022e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 3 Mar 2026 10:10:25 -0800 Subject: [PATCH 14/23] Maybe inside a clean container? Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0887891b9c..a90f439c55 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -38,6 +38,9 @@ jobs: pytorch: name: 'PyTorch' runs-on: ubuntu-latest + container: + image: ubuntu:24.04 + options: --user root steps: # - name: Maximize build space # uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794 @@ -59,6 +62,8 @@ jobs: - name: 'Dependencies' run: | + apt-get update && apt-get install -y python3 python3-pip + export PIP_BREAK_SYSTEM_PACKAGES=1 pip install cmake ninja pybind11 numpy pip install torch --index-url https://download.pytorch.org/whl/cu130 pip install "nvidia-cuda-nvcc<13.1" "nvidia-cuda-cccl<13.1" "nvidia-cuda-crt<13.1" "nvidia-nvvm<13.1" "nvidia-cuda-profiler-api<13.1" "nvidia-nvml-dev<13.1" @@ -78,10 +83,12 @@ jobs: export PATH=$CUDA_PATH/bin:$PATH export LD_LIBRARY_PATH=$LD_LIBRARY_PATH/lib:$LD_LIBRARY_PATH export LIBRARY_PATH=$LIBRARY_PATH/lib:$LIBRARY_PATH + export NVTE_BUILD_USE_NVIDIA_WHEELS=1 + export PIP_BREAK_SYSTEM_PACKAGES=1 pip install --no-build-isolation . -v --no-deps env: NVTE_FRAMEWORK: pytorch - MAX_JOBS: 1 + MAX_JOBS: 2 SCCACHE_GHA_ENABLED: "true" - name: 'Sanity check' run: python3 tests/pytorch/test_sanity_import.py From 13abe8f05b1b64b8adae07796222f816ea69f845 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 3 Mar 2026 10:12:44 -0800 Subject: [PATCH 15/23] Test Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a90f439c55..d2eedddfd5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -53,6 +53,8 @@ jobs: # remove-haskell: 'true' # remove-codeql: 'true' # + - name: 'Container dependencies' + run: apt-get update && apt-get install -y git python3 python3-pip - name: 'Checkout' uses: actions/checkout@v3 with: @@ -62,7 +64,6 @@ jobs: - name: 'Dependencies' run: | - apt-get update && apt-get install -y python3 python3-pip export PIP_BREAK_SYSTEM_PACKAGES=1 pip install cmake ninja pybind11 numpy pip install torch --index-url https://download.pytorch.org/whl/cu130 From f0714efa288c4b2e0c46ad7052952f188f45c4c4 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 3 Mar 2026 12:14:59 -0800 Subject: [PATCH 16/23] Add packaging missing dependency and ccache Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d2eedddfd5..e11b1e74b1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -54,7 +54,7 @@ jobs: # remove-codeql: 'true' # - name: 'Container dependencies' - run: apt-get update && apt-get install -y git python3 python3-pip + run: apt-get update && apt-get install -y git python3 python3-pip ccache - name: 'Checkout' uses: actions/checkout@v3 with: @@ -65,7 +65,7 @@ jobs: - name: 'Dependencies' run: | export PIP_BREAK_SYSTEM_PACKAGES=1 - pip install cmake ninja pybind11 numpy + pip install cmake ninja pybind11 numpy packaging pip install torch --index-url https://download.pytorch.org/whl/cu130 pip install "nvidia-cuda-nvcc<13.1" "nvidia-cuda-cccl<13.1" "nvidia-cuda-crt<13.1" "nvidia-nvvm<13.1" "nvidia-cuda-profiler-api<13.1" "nvidia-nvml-dev<13.1" export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") @@ -76,7 +76,13 @@ jobs: - name: "Disk space check after dependencies" run: df -lh - + - uses: actions/cache@v4 + with: + path: /root/.ccache + key: ccache-${{ runner.os }}-${{ github.ref }} + restore-keys: ccache-${{ runner.os }}- + - name: "Disk space check after dependencies and ccache" + run: df -lh - name: 'Build' run: | export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") @@ -86,11 +92,12 @@ jobs: export LIBRARY_PATH=$LIBRARY_PATH/lib:$LIBRARY_PATH export NVTE_BUILD_USE_NVIDIA_WHEELS=1 export PIP_BREAK_SYSTEM_PACKAGES=1 - pip install --no-build-isolation . -v --no-deps + NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v --no-deps env: NVTE_FRAMEWORK: pytorch MAX_JOBS: 2 - SCCACHE_GHA_ENABLED: "true" + CCACHE_DIR: /root/.ccache + CCACHE_MAXSIZE: 2G - name: 'Sanity check' run: python3 tests/pytorch/test_sanity_import.py jax: From 47fa3f1658915a3530a63ad818f3ded34183be61 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 3 Mar 2026 14:08:13 -0800 Subject: [PATCH 17/23] Fix Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e11b1e74b1..e9150aab72 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -92,7 +92,7 @@ jobs: export LIBRARY_PATH=$LIBRARY_PATH/lib:$LIBRARY_PATH export NVTE_BUILD_USE_NVIDIA_WHEELS=1 export PIP_BREAK_SYSTEM_PACKAGES=1 - NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v --no-deps + NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v env: NVTE_FRAMEWORK: pytorch MAX_JOBS: 2 From 5e26ea2193fde627e5e3c0318becc60521c7a078 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 9 Mar 2026 15:46:40 -0700 Subject: [PATCH 18/23] Changed All Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 112 +++++++++--------- .../jax/csrc/extensions/cgemm_helper.h | 2 + 2 files changed, 58 insertions(+), 56 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e9150aab72..c8ea709a35 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -18,20 +18,22 @@ jobs: - name: 'Dependencies' run: | apt-get update - apt-get install -y git python3.9 pip cudnn9-cuda-12 + apt-get install -y git python3.9 pip cudnn9-cuda-12 ccache pip install cmake==3.21.0 pybind11[global] ninja - name: 'Checkout' uses: actions/checkout@v3 with: submodules: recursive - - name: ccache - uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad + - uses: actions/cache@v4 + with: + path: /root/.ccache + key: ccache-${{ runner.os }}-core-${{ github.ref }} + restore-keys: ccache-${{ runner.os }}-core- - name: 'Build' - run: NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v + run: NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v env: NVTE_FRAMEWORK: none MAX_JOBS: 1 - SCCACHE_GHA_ENABLED: "true" - name: 'Sanity check' run: python3 -c "import transformer_engine" working-directory: / @@ -42,17 +44,6 @@ jobs: image: ubuntu:24.04 options: --user root steps: - # - name: Maximize build space - # uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794 - # with: - # root-reserve-mb: 5120 - # temp-reserve-mb: 32 - # swap-size-mb: 10240 - # remove-dotnet: 'true' - # remove-android: 'true' - # remove-haskell: 'true' - # remove-codeql: 'true' - # - name: 'Container dependencies' run: apt-get update && apt-get install -y git python3 python3-pip ccache - name: 'Checkout' @@ -79,8 +70,8 @@ jobs: - uses: actions/cache@v4 with: path: /root/.ccache - key: ccache-${{ runner.os }}-${{ github.ref }} - restore-keys: ccache-${{ runner.os }}- + key: ccache-${{ runner.os }}-pytorch-${{ github.ref }} + restore-keys: ccache-${{ runner.os }}-pytorch- - name: "Disk space check after dependencies and ccache" run: df -lh - name: 'Build' @@ -88,8 +79,6 @@ jobs: export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") export CUDNN_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cudnn')") export PATH=$CUDA_PATH/bin:$PATH - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH/lib:$LD_LIBRARY_PATH - export LIBRARY_PATH=$LIBRARY_PATH/lib:$LIBRARY_PATH export NVTE_BUILD_USE_NVIDIA_WHEELS=1 export PIP_BREAK_SYSTEM_PACKAGES=1 NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v @@ -113,60 +102,71 @@ jobs: uses: actions/checkout@v3 with: submodules: recursive - - name: ccache - uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad + - uses: actions/cache@v4 + with: + path: /root/.ccache + key: ccache-${{ runner.os }}-jax-${{ github.ref }} + restore-keys: ccache-${{ runner.os }}-jax- - name: 'Build' run: | - NVTE_CCACHE_BIN=sccache NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v + NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v env: NVTE_FRAMEWORK: jax MAX_JOBS: 1 - SCCACHE_GHA_ENABLED: "true" - name: 'Sanity check' run: python3 tests/jax/test_sanity_import.py all: name: 'All' runs-on: ubuntu-latest + container: + image: ubuntu:24.04 + options: --user root steps: - - name: Move /var/lib/docker/ - shell: bash -euxo pipefail {0} - run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker" - - - name: Maximize build space - uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794 - with: - root-reserve-mb: 5120 - temp-reserve-mb: 32 - swap-size-mb: 10240 - remove-dotnet: 'true' - remove-android: 'true' - remove-haskell: 'true' - remove-codeql: 'true' - build-mount-path: '/var/lib/docker/' - - - name: Restore /var/lib/docker/ - shell: bash -euxo pipefail {0} - run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker" - + - name: 'Container dependencies' + run: apt-get update && apt-get install -y git python3 python3-pip ccache - name: 'Checkout' uses: actions/checkout@v3 with: submodules: recursive - - - name: Start named container - run: | - docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d ghcr.io/nvidia/jax:jax sleep infinity + - name: "Disk space check before dependencies" + run: df -lh - name: 'Dependencies' run: | - docker exec builder bash -c '\ - pip install cmake==3.21.0 pybind11[global] einops onnxscript && \ - pip install torch --no-cache-dir --index-url https://download.pytorch.org/whl/cu130 - ' + export PIP_BREAK_SYSTEM_PACKAGES=1 + pip install cmake ninja pybind11 numpy packaging + pip install torch --index-url https://download.pytorch.org/whl/cu130 + pip install jax[cuda13] + pip install "nvidia-cuda-nvcc<13.1" "nvidia-cuda-cccl<13.1" "nvidia-cuda-crt<13.1" "nvidia-nvvm<13.1" "nvidia-cuda-profiler-api<13.1" "nvidia-nvml-dev<13.1" + export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") + echo $CUDA_PATH + ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_PATH/lib/libcudart.so + ln -s $CUDA_PATH/lib/libcublas.so.13 $CUDA_PATH/lib/libcublas.so + ln -s $CUDA_PATH/lib $CUDA_PATH/lib64 + + - name: "Disk space check after dependencies" + run: df -lh + - uses: actions/cache@v4 + with: + path: /root/.ccache + key: ccache-${{ runner.os }}-all-${{ github.ref }} + restore-keys: ccache-${{ runner.os }}-all- + - name: "Disk space check after dependencies and ccache" + run: df -lh - name: 'Build' - run: docker exec builder bash -c 'pip install --no-cache-dir --no-build-isolation . -v --no-deps' + run: | + export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") + export CUDNN_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cudnn')") + export PATH=$CUDA_PATH/bin:$PATH + export NVTE_BUILD_USE_NVIDIA_WHEELS=1 + export PIP_BREAK_SYSTEM_PACKAGES=1 + NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v env: NVTE_FRAMEWORK: all - MAX_JOBS: 1 - - name: 'Sanity check' - run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py && python3 tests/jax/test_sanity_import.py' + MAX_JOBS: 2 + CCACHE_DIR: /root/.ccache + CCACHE_MAXSIZE: 2G + - name: 'Sanity check (pytorch)' + run: python3 tests/pytorch/test_sanity_import.py + - name: 'Sanity check (jax)' + run: python3 tests/jax/test_sanity_import.py diff --git a/transformer_engine/jax/csrc/extensions/cgemm_helper.h b/transformer_engine/jax/csrc/extensions/cgemm_helper.h index 2b980e7ee4..1de3f85f31 100644 --- a/transformer_engine/jax/csrc/extensions/cgemm_helper.h +++ b/transformer_engine/jax/csrc/extensions/cgemm_helper.h @@ -17,6 +17,8 @@ #include #include +#include + #include "../extensions.h" #include "common/comm_gemm_overlap/userbuffers/userbuffers.h" #include "common/util/cuda_runtime.h" From 68a3c70a3109354ee206f3d405843ffcf565f800 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 22:49:28 +0000 Subject: [PATCH 19/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- transformer_engine/jax/csrc/extensions/cgemm_helper.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/transformer_engine/jax/csrc/extensions/cgemm_helper.h b/transformer_engine/jax/csrc/extensions/cgemm_helper.h index 1de3f85f31..edab003c2a 100644 --- a/transformer_engine/jax/csrc/extensions/cgemm_helper.h +++ b/transformer_engine/jax/csrc/extensions/cgemm_helper.h @@ -7,6 +7,7 @@ #ifndef TRANSFORMER_ENGINE_JAX_CGEMM_HELPER_H_ #define TRANSFORMER_ENGINE_JAX_CGEMM_HELPER_H_ +#include #include #include @@ -17,8 +18,6 @@ #include #include -#include - #include "../extensions.h" #include "common/comm_gemm_overlap/userbuffers/userbuffers.h" #include "common/util/cuda_runtime.h" From 8cd53db9d37e75f3921457bebcaab02847677ff8 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 9 Mar 2026 16:28:28 -0700 Subject: [PATCH 20/23] Add ccache to jax build Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c8ea709a35..ca76bb79bc 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -97,7 +97,9 @@ jobs: options: --user root steps: - name: 'Dependencies' - run: pip install cmake==3.21.0 pybind11[global] + run: | + pip install cmake==3.21.0 pybind11[global] + apt-get update && apt-get install -y ccache - name: 'Checkout' uses: actions/checkout@v3 with: From d2504035062c9a84020e1071413c92d372cc8966 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 9 Mar 2026 16:43:51 -0700 Subject: [PATCH 21/23] Fixes Signed-off-by: Przemek Tredak --- .github/workflows/build.yml | 2 ++ build_tools/jax.py | 7 ++++++- build_tools/utils.py | 27 +++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ca76bb79bc..69a095a496 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -63,6 +63,7 @@ jobs: echo $CUDA_PATH ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_PATH/lib/libcudart.so ln -s $CUDA_PATH/lib/libcublas.so.13 $CUDA_PATH/lib/libcublas.so + ln -s $CUDA_PATH/../nccl/lib/libnccl.so.2 $CUDA_PATH/../nccl/lib/libnccl.so ln -s $CUDA_PATH/lib $CUDA_PATH/lib64 - name: "Disk space check after dependencies" @@ -144,6 +145,7 @@ jobs: echo $CUDA_PATH ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_PATH/lib/libcudart.so ln -s $CUDA_PATH/lib/libcublas.so.13 $CUDA_PATH/lib/libcublas.so + ln -s $CUDA_PATH/../nccl/lib/libnccl.so.2 $CUDA_PATH/../nccl/lib/libnccl.so ln -s $CUDA_PATH/lib $CUDA_PATH/lib64 - name: "Disk space check after dependencies" diff --git a/build_tools/jax.py b/build_tools/jax.py index f07c0a202f..60359ce906 100644 --- a/build_tools/jax.py +++ b/build_tools/jax.py @@ -9,7 +9,8 @@ import setuptools -from .utils import get_cuda_include_dirs, all_files_in_dir, debug_build_enabled +from .utils import (get_cuda_include_dirs, all_files_in_dir, + debug_build_enabled, get_cuda_library_dirs) from typing import List @@ -92,6 +93,9 @@ def setup_jax_extension( ] ) + # Library dirs + library_dirs = get_cuda_library_dirs() + # Compile flags cxx_flags = ["-O3"] if debug_build_enabled(): @@ -109,4 +113,5 @@ def setup_jax_extension( include_dirs=[str(path) for path in include_dirs], extra_compile_args=cxx_flags, libraries=["nccl"], + library_dirs=library_dirs, ) diff --git a/build_tools/utils.py b/build_tools/utils.py index 2c5d441866..7cf384ed75 100644 --- a/build_tools/utils.py +++ b/build_tools/utils.py @@ -250,6 +250,33 @@ def get_cuda_include_dirs() -> Tuple[str, str]: if subdir.is_dir() and (subdir / "include").is_dir() ] +@functools.lru_cache(maxsize=None) +def get_cuda_library_dirs() -> Tuple[str, str]: + """Returns the CUDA library directory.""" + + force_wheels = bool(int(os.getenv("NVTE_BUILD_USE_NVIDIA_WHEELS", "0"))) + # If cuda is installed via toolkit, all necessary headers + # are bundled inside the top level cuda directory. + if not force_wheels and cuda_toolkit_include_path() is not None: + return [] + + # Use pip wheels to include all headers. + try: + import nvidia + except ModuleNotFoundError as e: + raise RuntimeError("CUDA not found.") + + if nvidia.__file__ is not None: + cuda_root = Path(nvidia.__file__).parent + else: + cuda_root = Path(nvidia.__path__[0]) # namespace + return [ + subdir / "lib" + for subdir in cuda_root.iterdir() + if subdir.is_dir() and (subdir / "lib").is_dir() + ] + + @functools.lru_cache(maxsize=None) def cuda_archs() -> str: From 27218dae3e8cf5553daec9496209844429b57bcb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 23:47:08 +0000 Subject: [PATCH 22/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- build_tools/jax.py | 8 ++++++-- build_tools/utils.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/build_tools/jax.py b/build_tools/jax.py index 60359ce906..42bbada4bf 100644 --- a/build_tools/jax.py +++ b/build_tools/jax.py @@ -9,8 +9,12 @@ import setuptools -from .utils import (get_cuda_include_dirs, all_files_in_dir, - debug_build_enabled, get_cuda_library_dirs) +from .utils import ( + get_cuda_include_dirs, + all_files_in_dir, + debug_build_enabled, + get_cuda_library_dirs, +) from typing import List diff --git a/build_tools/utils.py b/build_tools/utils.py index 7cf384ed75..a078a3ea00 100644 --- a/build_tools/utils.py +++ b/build_tools/utils.py @@ -250,6 +250,7 @@ def get_cuda_include_dirs() -> Tuple[str, str]: if subdir.is_dir() and (subdir / "include").is_dir() ] + @functools.lru_cache(maxsize=None) def get_cuda_library_dirs() -> Tuple[str, str]: """Returns the CUDA library directory.""" @@ -277,7 +278,6 @@ def get_cuda_library_dirs() -> Tuple[str, str]: ] - @functools.lru_cache(maxsize=None) def cuda_archs() -> str: archs = os.getenv("NVTE_CUDA_ARCHS") From 3949f121fb30df4dccad2795a9c6f3c03352aa7d Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 9 Mar 2026 16:47:47 -0700 Subject: [PATCH 23/23] Fix Signed-off-by: Przemek Tredak --- build_tools/jax.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/jax.py b/build_tools/jax.py index 42bbada4bf..1ffad16ebc 100644 --- a/build_tools/jax.py +++ b/build_tools/jax.py @@ -117,5 +117,5 @@ def setup_jax_extension( include_dirs=[str(path) for path in include_dirs], extra_compile_args=cxx_flags, libraries=["nccl"], - library_dirs=library_dirs, + library_dirs=[str(path) for path in library_dirs], )