diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0f05dbc40a..aa4b54aff8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -22,72 +22,78 @@ jobs: - name: 'Dependencies' run: | apt-get update - apt-get install -y git python3.9 pip cudnn9-cuda-12 + apt-get install -y git python3.9 pip cudnn9-cuda-12 ccache pip install cmake==3.21.0 pybind11[global] ninja - name: 'Checkout' uses: actions/checkout@v3 with: submodules: recursive - - name: ccache - uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad + - uses: actions/cache@v4 + with: + path: /root/.ccache + key: ccache-${{ runner.os }}-core-${{ github.ref }} + restore-keys: ccache-${{ runner.os }}-core- - name: 'Build' - run: NVTE_USE_CCACHE=1 NVTE_CCACHE_BIN=sccache pip install --no-build-isolation . -v + run: CCACHE_DIR=/root/.ccache NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v env: NVTE_FRAMEWORK: none MAX_JOBS: 1 - SCCACHE_GHA_ENABLED: "true" - name: 'Sanity check' run: python3 -c "import transformer_engine" working-directory: / pytorch: name: 'PyTorch' runs-on: ubuntu-latest + container: + image: ubuntu:24.04 + options: --user root steps: - - name: Move /var/lib/docker/ - shell: bash -euxo pipefail {0} - run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker" - - - name: Maximize build space - uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794 - with: - root-reserve-mb: 5120 - temp-reserve-mb: 32 - swap-size-mb: 10240 - remove-dotnet: 'true' - remove-android: 'true' - remove-haskell: 'true' - remove-codeql: 'true' - build-mount-path: '/var/lib/docker/' - - - name: Restore /var/lib/docker/ - shell: bash -euxo pipefail {0} - run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker" - + - name: 'Container dependencies' + run: apt-get update && apt-get install -y git python3 python3-pip ccache - name: 'Checkout' uses: actions/checkout@v3 with: submodules: recursive - - - name: Start named container - run: | - docker 
run -v $(pwd):$(pwd) -w $(pwd) --name builder -d nvcr.io/nvidia/cuda:12.8.0-devel-ubuntu22.04 sleep infinity + - name: "Disk space check before dependencies" + run: df -lh - name: 'Dependencies' run: | - docker exec builder bash -c '\ - apt-get update && \ - apt-get install -y git python3.9 pip cudnn9-cuda-12 && \ - pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript && \ - apt-get clean \ - ' + export PIP_BREAK_SYSTEM_PACKAGES=1 + pip install cmake ninja pybind11 numpy packaging + pip install torch --index-url https://download.pytorch.org/whl/cu130 + pip install "nvidia-cuda-nvcc<13.1" "nvidia-cuda-cccl<13.1" "nvidia-cuda-crt<13.1" "nvidia-nvvm<13.1" "nvidia-cuda-profiler-api<13.1" "nvidia-nvml-dev<13.1" + export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") + echo $CUDA_PATH + ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_PATH/lib/libcudart.so + ln -s $CUDA_PATH/lib/libcublas.so.13 $CUDA_PATH/lib/libcublas.so + ln -s $CUDA_PATH/../nccl/lib/libnccl.so.2 $CUDA_PATH/../nccl/lib/libnccl.so + ln -s $CUDA_PATH/lib $CUDA_PATH/lib64 + - name: "Disk space check after dependencies" + run: df -lh + - uses: actions/cache@v4 + with: + path: /root/.ccache + key: ccache-${{ runner.os }}-pytorch-${{ github.ref }} + restore-keys: ccache-${{ runner.os }}-pytorch- + - name: "Disk space check after dependencies and ccache" + run: df -lh - name: 'Build' - run: docker exec builder bash -c 'pip install --no-build-isolation . -v --no-deps' + run: | + export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") + export CUDNN_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cudnn')") + export PATH=$CUDA_PATH/bin:$PATH + export NVTE_BUILD_USE_NVIDIA_WHEELS=1 + export PIP_BREAK_SYSTEM_PACKAGES=1 + NVTE_USE_CCACHE=1 pip install --no-build-isolation . 
-v env: NVTE_FRAMEWORK: pytorch - MAX_JOBS: 1 + MAX_JOBS: 2 + CCACHE_DIR: /root/.ccache + CCACHE_MAXSIZE: 2G - name: 'Sanity check' - run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py' + run: python3 tests/pytorch/test_sanity_import.py jax: name: 'JAX' runs-on: ubuntu-latest @@ -96,65 +102,79 @@ jobs: options: --user root steps: - name: 'Dependencies' - run: pip install cmake==3.21.0 pybind11[global] + run: | + pip install cmake==3.21.0 pybind11[global] + apt-get update && apt-get install -y ccache - name: 'Checkout' uses: actions/checkout@v3 with: submodules: recursive - - name: ccache - uses: mozilla-actions/sccache-action@7d986dd989559c6ecdb630a3fd2557667be217ad + - uses: actions/cache@v4 + with: + path: /root/.ccache + key: ccache-${{ runner.os }}-jax-${{ github.ref }} + restore-keys: ccache-${{ runner.os }}-jax- - name: 'Build' run: | - NVTE_CCACHE_BIN=sccache NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v + CCACHE_DIR=/root/.ccache NVTE_USE_CCACHE=1 pip install --no-build-isolation . 
-v env: NVTE_FRAMEWORK: jax MAX_JOBS: 1 - SCCACHE_GHA_ENABLED: "true" - name: 'Sanity check' run: python3 tests/jax/test_sanity_import.py all: name: 'All' runs-on: ubuntu-latest + container: + image: ubuntu:24.04 + options: --user root steps: - - name: Move /var/lib/docker/ - shell: bash -euxo pipefail {0} - run: sudo mv /var/lib/docker/ "${GITHUB_WORKSPACE}/docker" - - - name: Maximize build space - uses: easimon/maximize-build-space@c28619d8999a147d5e09c1199f84ff6af6ad5794 - with: - root-reserve-mb: 5120 - temp-reserve-mb: 32 - swap-size-mb: 10240 - remove-dotnet: 'true' - remove-android: 'true' - remove-haskell: 'true' - remove-codeql: 'true' - build-mount-path: '/var/lib/docker/' - - - name: Restore /var/lib/docker/ - shell: bash -euxo pipefail {0} - run: sudo sh -c "mv ${GITHUB_WORKSPACE}/docker/* /var/lib/docker" - + - name: 'Container dependencies' + run: apt-get update && apt-get install -y git python3 python3-pip ccache - name: 'Checkout' uses: actions/checkout@v3 with: submodules: recursive - - - name: Start named container - run: | - docker run -v $(pwd):$(pwd) -w $(pwd) --name builder -d ghcr.io/nvidia/jax:jax sleep infinity + - name: "Disk space check before dependencies" + run: df -lh - name: 'Dependencies' run: | - docker exec builder bash -c '\ - pip install cmake==3.21.0 pybind11[global] einops onnxscript && \ - pip install torch --no-cache-dir --index-url https://download.pytorch.org/whl/cu130 - ' + export PIP_BREAK_SYSTEM_PACKAGES=1 + pip install cmake ninja pybind11 numpy packaging + pip install torch --index-url https://download.pytorch.org/whl/cu130 + pip install jax[cuda13] + pip install "nvidia-cuda-nvcc<13.1" "nvidia-cuda-cccl<13.1" "nvidia-cuda-crt<13.1" "nvidia-nvvm<13.1" "nvidia-cuda-profiler-api<13.1" "nvidia-nvml-dev<13.1" + export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") + echo $CUDA_PATH + ln -s $CUDA_PATH/lib/libcudart.so.13 $CUDA_PATH/lib/libcudart.so + ln -s $CUDA_PATH/lib/libcublas.so.13 
$CUDA_PATH/lib/libcublas.so + ln -s $CUDA_PATH/../nccl/lib/libnccl.so.2 $CUDA_PATH/../nccl/lib/libnccl.so + ln -s $CUDA_PATH/lib $CUDA_PATH/lib64 + + - name: "Disk space check after dependencies" + run: df -lh + - uses: actions/cache@v4 + with: + path: /root/.ccache + key: ccache-${{ runner.os }}-all-${{ github.ref }} + restore-keys: ccache-${{ runner.os }}-all- + - name: "Disk space check after dependencies and ccache" + run: df -lh - name: 'Build' - run: docker exec builder bash -c 'pip install --no-cache-dir --no-build-isolation . -v --no-deps' + run: | + export CUDA_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cu13')") + export CUDNN_PATH=$(python3 -c "import nvidia; print(list(nvidia.__path__)[0] + '/cudnn')") + export PATH=$CUDA_PATH/bin:$PATH + export NVTE_BUILD_USE_NVIDIA_WHEELS=1 + export PIP_BREAK_SYSTEM_PACKAGES=1 + NVTE_USE_CCACHE=1 pip install --no-build-isolation . -v env: NVTE_FRAMEWORK: all - MAX_JOBS: 1 - - name: 'Sanity check' - run: docker exec builder bash -c 'python3 tests/pytorch/test_sanity_import.py && python3 tests/jax/test_sanity_import.py' + MAX_JOBS: 2 + CCACHE_DIR: /root/.ccache + CCACHE_MAXSIZE: 2G + - name: 'Sanity check (pytorch)' + run: python3 tests/pytorch/test_sanity_import.py + - name: 'Sanity check (jax)' + run: python3 tests/jax/test_sanity_import.py diff --git a/build_tools/jax.py b/build_tools/jax.py index f07c0a202f..1ffad16ebc 100644 --- a/build_tools/jax.py +++ b/build_tools/jax.py @@ -9,7 +9,12 @@ import setuptools -from .utils import get_cuda_include_dirs, all_files_in_dir, debug_build_enabled +from .utils import ( + get_cuda_include_dirs, + all_files_in_dir, + debug_build_enabled, + get_cuda_library_dirs, +) from typing import List @@ -92,6 +97,9 @@ def setup_jax_extension( ] ) + # Library dirs + library_dirs = get_cuda_library_dirs() + # Compile flags cxx_flags = ["-O3"] if debug_build_enabled(): @@ -109,4 +117,5 @@ def setup_jax_extension( include_dirs=[str(path) for path in include_dirs], 
extra_compile_args=cxx_flags, libraries=["nccl"], + library_dirs=[str(path) for path in library_dirs], ) diff --git a/build_tools/utils.py b/build_tools/utils.py index 885901068a..a078a3ea00 100644 --- a/build_tools/utils.py +++ b/build_tools/utils.py @@ -251,6 +251,33 @@ def get_cuda_include_dirs() -> Tuple[str, str]: ] +@functools.lru_cache(maxsize=None) +def get_cuda_library_dirs() -> Tuple[str, str]: + """Returns the CUDA library directories.""" + + force_wheels = bool(int(os.getenv("NVTE_BUILD_USE_NVIDIA_WHEELS", "0"))) + # If cuda is installed via toolkit, all necessary libraries + # are bundled inside the top level cuda directory. + if not force_wheels and cuda_toolkit_include_path() is not None: + return [] + + # Use pip wheels to include all libraries. + try: + import nvidia + except ModuleNotFoundError as e: + raise RuntimeError("CUDA not found.") from e + + if nvidia.__file__ is not None: + cuda_root = Path(nvidia.__file__).parent + else: + cuda_root = Path(list(nvidia.__path__)[0]) # namespace + return [ + subdir / "lib" + for subdir in cuda_root.iterdir() + if subdir.is_dir() and (subdir / "lib").is_dir() + ] + + @functools.lru_cache(maxsize=None) def cuda_archs() -> str: archs = os.getenv("NVTE_CUDA_ARCHS") @@ -292,6 +319,13 @@ def cuda_version() -> Tuple[int, ...]: version_str = get_version("nvidia-cuda-runtime-cu12") version_tuple = tuple(int(part) for part in version_str.split(".") if part.isdigit()) return version_tuple + except importlib.metadata.PackageNotFoundError: + pass + + try: + version_str = get_version("nvidia-cuda-runtime") + version_tuple = tuple(int(part) for part in version_str.split(".") if part.isdigit()) + return version_tuple except importlib.metadata.PackageNotFoundError: raise RuntimeError("Could neither find NVCC executable nor CUDA runtime Python package.") diff --git a/transformer_engine/common/util/logging.h b/transformer_engine/common/util/logging.h index 8031e342e2..d85046dcbd 100644 --- a/transformer_engine/common/util/logging.h +++ b/transformer_engine/common/util/logging.h @@ 
-12,10 +12,10 @@ #include #include -#include "nccl.h" - #ifdef NVTE_WITH_CUBLASMP #include + +#include "nccl.h" #endif // NVTE_WITH_CUBLASMP #include diff --git a/transformer_engine/jax/csrc/extensions/cgemm_helper.h b/transformer_engine/jax/csrc/extensions/cgemm_helper.h index 2b980e7ee4..edab003c2a 100644 --- a/transformer_engine/jax/csrc/extensions/cgemm_helper.h +++ b/transformer_engine/jax/csrc/extensions/cgemm_helper.h @@ -7,6 +7,7 @@ #ifndef TRANSFORMER_ENGINE_JAX_CGEMM_HELPER_H_ #define TRANSFORMER_ENGINE_JAX_CGEMM_HELPER_H_ +#include #include #include