Changes from all commits (24 commits)
0289e76 Changed VERSION to 2.7.0 (ptrendx, Aug 18, 2025)
34150d1 [JAX] Error checking for mesh resource and update GemmPrimitive to us… (jberchtold-nvidia, Aug 20, 2025)
9f065fa [PyTorch] Avoid garbage collection when capturing a CUDA Graph (#2092) (timmoon10, Aug 20, 2025)
3a4136b Fix incorrect version checks for atomic GEMM (#2095) (timmoon10, Aug 20, 2025)
0168c26 [TE-JAX] Expose cp_strategy argument to DPA api (#2090) (kocchop, Aug 21, 2025)
e94041a [PyTorch] Debug Mcore wgrad fusion with te.ops (#2097) (timmoon10, Aug 23, 2025)
c638ac7 [JAX] Add Shardy warning in GEMM custom call (#2101) (phu0ngng, Aug 25, 2025)
4572dbe Revert "[Common] PDL for Quantization Kernels" (#2114) (jberchtold-nvidia, Aug 26, 2025)
d2615d1 Bump cuDNN FE to 1.14.0 (#2072) (vcherepanov-nv, Aug 26, 2025)
58c3ac8 Revert "[Common] PDL for Blockwise Quantization" (#2115) (jberchtold-nvidia, Aug 26, 2025)
f8d2c50 [PyTorch] Add test for TRT integration + fix for mxfp8 export (#2083) (pggPL, Aug 20, 2025)
d7874aa Add cuBLASMp-backed GEMM-like API to TE common (#1824) (mk-61, Aug 26, 2025)
1d1e8ef Further relax constraints to cuDNN 9.13 for disabling fused attn for … (KshitijLakhani, Aug 27, 2025)
9cd6d16 Temporarily remove comm_gemm tests (#2133) (vcherepanov-nv, Aug 28, 2025)
fedd9dd [PyTorch] Disable determinism for sm100 (#2130) (cyanguwa, Aug 28, 2025)
d5858d4 dev-base initialize: with version change and log verify (miceforrat, Jan 5, 2026)
7c17ea4 Merge pull request #1 from ETOgaosion/dev-base (miceforrat, Jan 6, 2026)
3bf99b5 feat: pure nvshmem comm based cp (Knight-of-Thunder, Jan 21, 2026)
2cb93fb Merge pull request #2 from ETOgaosion/feat/nvshmem_based_cp (ETOgaosion, Jan 21, 2026)
1c046e2 feat: store fa result in nvshmem (Knight-of-Thunder, Jan 26, 2026)
f63d3c3 refactor: create a new stream for communication and use pybindings to… (Knight-of-Thunder, Mar 5, 2026)
4f77ecf fix: ensure the get is completed before computation by using sync (Knight-of-Thunder, Mar 5, 2026)
cee55e4 refactor: use NVSHMEM pybindings in bwd (Knight-of-Thunder, Mar 5, 2026)
1c88f8f feat: context_parallel_nvshmem_enhanced (Knight-of-Thunder, Mar 10, 2026)
2 changes: 1 addition & 1 deletion 3rdparty/cudnn-frontend
Submodule cudnn-frontend updated 40 files
+1 −1 CMakeLists.txt
+2 −2 include/cudnn_frontend/backend/backend_descriptor.h
+16 −16 include/cudnn_frontend/backend/device_properties.h
+8 −8 include/cudnn_frontend/backend/execution_helpers.h
+13 −13 include/cudnn_frontend/backend/kernel_cache.h
+56 −56 include/cudnn_frontend/backend/plan_helpers.h
+2 −2 include/cudnn_frontend/graph_helpers.h
+128 −82 include/cudnn_frontend/graph_interface.h
+124 −146 include/cudnn_frontend/graph_properties.h
+95 −95 include/cudnn_frontend/node/adaptive_layernorm.h
+27 −27 include/cudnn_frontend/node/block_scale_dequantize.h
+27 −27 include/cudnn_frontend/node/block_scale_quantize.h
+21 −21 include/cudnn_frontend/node/concatenate.h
+6 −2 include/cudnn_frontend/node/matmul_fp8.h
+12 −8 include/cudnn_frontend/node/paged_cache_load.h
+277 −341 include/cudnn_frontend/node/scaled_dot_product_flash_attention.h
+19 −2 include/cudnn_frontend/node/sdpa_fp8_bwd.h
+467 −0 include/cudnn_frontend/node/sdpa_support_surface.h
+2 −2 include/cudnn_frontend/node_interface.h
+1 −1 include/cudnn_frontend/plans.h
+10 −31 include/cudnn_frontend/utils/serialize.h
+10 −1 include/cudnn_frontend_utils.h
+1 −1 include/cudnn_frontend_version.h
+9 −2 python/cudnn/__init__.py
+46 −0 python/properties.cpp
+28 −6 python/pygraph/pygraph.cpp
+59 −5 python/pygraph/pygraph.h
+319 −132 python/pygraph/sdpa.cpp
+7 −1 samples/README.md
+1 −1 samples/cpp/matmul/blackwell_nvfp4_mxfp8_block_scale_matmul.cpp
+1 −1 samples/cpp/matmul/general_block_scale_matmul.cpp
+5 −3 samples/cpp/misc/custom_plan.cpp
+5 −5 samples/cpp/misc/deviceless_aot_compilation.cpp
+0 −192 samples/legacy_samples/utils/error_util.h
+583 −0 samples/python/33_layernorm_forward_training_and_backward_with_relu_bitmask.ipynb
+233 −61 samples/python/52_scaled_dot_product_attention_with_paged_caches.ipynb
+486 −0 samples/python/53_scaled_dot_product_attention_decode_with_paged_caches.ipynb
+1 −2 test/cpp/serialize.cpp
+104 −0 test/python/test_deviceless_aot_compilation.py
+4 −4 test/python/test_low_precision_matmul.py
2 changes: 1 addition & 1 deletion build_tools/VERSION.txt
@@ -1 +1 @@
-2.7.0.dev0
+2.7.0
4 changes: 3 additions & 1 deletion examples/jax/encoder/test_single_gpu_encoder.py
@@ -219,7 +219,9 @@ def train_and_evaluate(args):
     else:
         fp8_recipe = None

-    with te.fp8_autocast(enabled=args.use_fp8, fp8_recipe=fp8_recipe):
+    with te.fp8_autocast(
+        enabled=args.use_fp8, fp8_recipe=fp8_recipe, mesh_resource=te.sharding.MeshResource()
+    ):
         encoder = Net(num_embed)
         # We use nn.Embed, thus inputs need to be in int
         inputs = jnp.zeros(input_shape, dtype=jnp.int32)
4 changes: 3 additions & 1 deletion examples/jax/mnist/test_single_gpu_mnist.py
@@ -193,7 +193,9 @@ def train_and_evaluate(args):
     else:
         fp8_recipe = None

-    with te.fp8_autocast(enabled=args.use_fp8, fp8_recipe=fp8_recipe):
+    with te.fp8_autocast(
+        enabled=args.use_fp8, fp8_recipe=fp8_recipe, mesh_resource=te.sharding.MeshResource()
+    ):
         cnn = Net(args.use_te)
         var_collect = cnn.init(init_rngs, jnp.empty(input_shape, dtype=jnp.bfloat16))
         tx = optax.sgd(args.lr, args.momentum)
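Note: both JAX examples above now pass an explicit mesh_resource to te.fp8_autocast, matching the mesh-resource error checking added in commit 34150d1. A minimal standalone sketch of the updated call, an illustration only, assuming a single-process run with no device mesh:

import jax.numpy as jnp
import transformer_engine.jax as te

# Sketch: an empty MeshResource (all axis names left at their None defaults)
# is the single-GPU case, so fp8_autocast's new validation has a well-defined
# resource to check.
with te.fp8_autocast(
    enabled=False,  # set True to train in fp8, as args.use_fp8 does above
    fp8_recipe=None,
    mesh_resource=te.sharding.MeshResource(),
):
    inputs = jnp.zeros((32, 128), dtype=jnp.int32)  # placeholder batch
    # ... initialize and apply the model inside this scope ...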
2 changes: 1 addition & 1 deletion qa/L0_cppunittest/test.sh
@@ -17,4 +17,4 @@ cd $TE_PATH/tests/cpp
 cmake -GNinja -Bbuild .
 cmake --build build
 export OMP_NUM_THREADS=$((NUM_PHYSICAL_CORES / NUM_PARALLEL_JOBS))
-ctest --test-dir build -j$NUM_PARALLEL_JOBS
+ctest --test-dir build -j$NUM_PARALLEL_JOBS -E '(AgGemm|GemmRs|GemmAr)'
3 changes: 0 additions & 3 deletions qa/L0_pytorch_unittest/test.sh
@@ -23,8 +23,6 @@ set -x
 mkdir -p "$XML_LOG_DIR"

 pip3 install pytest==8.2.1 || error_exit "Failed to install pytest"
-pip3 install onnxruntime==1.20.1 || error_exit "Failed to install onnxruntime"
-pip3 install onnxruntime_extensions==0.13.0 || error_exit "Failed to install onnxruntime_extensions"

 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_sanity.xml $TE_PATH/tests/pytorch/test_sanity.py || test_fail "test_sanity.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_recipe.xml $TE_PATH/tests/pytorch/test_recipe.py || test_fail "test_recipe.py"
@@ -40,7 +38,6 @@ python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_float8_blockwise
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_gqa.xml $TE_PATH/tests/pytorch/test_gqa.py || test_fail "test_gqa.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fused_optimizer.xml $TE_PATH/tests/pytorch/test_fused_optimizer.py || test_fail "test_fused_optimizer.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_multi_tensor.xml $TE_PATH/tests/pytorch/test_multi_tensor.py || test_fail "test_multi_tensor.py"
-python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_onnx_export.xml $TE_PATH/tests/pytorch/test_onnx_export.py || test_fail "test_onnx_export.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_fusible_ops.xml $TE_PATH/tests/pytorch/test_fusible_ops.py || test_fail "test_fusible_ops.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_permutation.xml $TE_PATH/tests/pytorch/test_permutation.py || test_fail "test_permutation.py"
 python3 -m pytest --tb=auto --junitxml=$XML_LOG_DIR/pytest_test_parallel_cross_entropy.xml $TE_PATH/tests/pytorch/test_parallel_cross_entropy.py || test_fail "test_parallel_cross_entropy.py"
15 changes: 15 additions & 0 deletions qa/L1_cpp_distributed/test.sh
@@ -0,0 +1,15 @@
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

set -e

# Find TE
: ${TE_PATH:=/opt/transformerengine}
TE_LIB_PATH=$(pip3 show transformer-engine | grep -E "Location:|Editable project location:" | tail -n 1 | awk '{print $NF}')
export LD_LIBRARY_PATH=$TE_LIB_PATH:$LD_LIBRARY_PATH

cd $TE_PATH/tests/cpp
cmake -GNinja -S. -Bbuild
cmake --build build
mpirun --allow-run-as-root --np 4 --oversubscribe ./build/comm_gemm/test_comm_gemm
11 changes: 11 additions & 0 deletions qa/L1_pytorch_onnx_unittest/test.sh
@@ -0,0 +1,11 @@
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.


pip3 install onnxruntime==1.20.1
pip3 install onnxruntime_extensions==0.13.0

: ${TE_PATH:=/opt/transformerengine}

python3 -m pytest --tb=auto $TE_PATH/tests/pytorch/test_onnx_export.py
13 changes: 13 additions & 0 deletions setup.py
@@ -4,6 +4,7 @@

 """Installation script."""

+from importlib import metadata
 import os
 import time
 from pathlib import Path
@@ -66,6 +67,18 @@ def setup_common_extension() -> CMakeExtension:
     if bool(int(os.getenv("NVTE_BUILD_ACTIVATION_WITH_FAST_MATH", "0"))):
         cmake_flags.append("-DNVTE_BUILD_ACTIVATION_WITH_FAST_MATH=ON")

+    if bool(int(os.getenv("NVTE_WITH_CUBLASMP", "0"))):
+        cmake_flags.append("-DNVTE_WITH_CUBLASMP=ON")
+        cublasmp_dir = os.getenv("CUBLASMP_HOME") or metadata.distribution(
+            "nvidia-cublasmp-cu12"
+        ).locate_file("nvidia/cublasmp/cu12")
+        cmake_flags.append(f"-DCUBLASMP_DIR={cublasmp_dir}")
+        nvshmem_dir = os.getenv("NVSHMEM_HOME") or metadata.distribution(
+            "nvidia-nvshmem-cu12"
+        ).locate_file("nvidia/nvshmem")
+        cmake_flags.append(f"-DNVSHMEM_DIR={nvshmem_dir}")
+
+        print("CMAKE_FLAGS:", cmake_flags[-2:])
Contributor comment:

Debug print left in build script

This print("CMAKE_FLAGS:", cmake_flags[-2:]) is a debug statement that will be emitted on every build where NVTE_WITH_CUBLASMP=1. It should be removed before merging to keep build output clean.

Suggested change:
         cmake_flags.append(f"-DNVSHMEM_DIR={nvshmem_dir}")
-
-        print("CMAKE_FLAGS:", cmake_flags[-2:])

     # Add custom CMake arguments from environment variable
     nvte_cmake_extra_args = os.getenv("NVTE_CMAKE_EXTRA_ARGS")
     if nvte_cmake_extra_args:
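For reference, when NVTE_WITH_CUBLASMP=1 and the CUBLASMP_HOME / NVSHMEM_HOME overrides are unset, setup.py resolves the library directories from the installed vendor wheels. A small sketch of that lookup, assuming the nvidia-cublasmp-cu12 and nvidia-nvshmem-cu12 wheels are installed:

from importlib import metadata
import os

# Prefer the explicit *_HOME override; otherwise locate the directory that
# ships inside the corresponding wheel, exactly as the diff above does.
cublasmp_dir = os.getenv("CUBLASMP_HOME") or metadata.distribution(
    "nvidia-cublasmp-cu12"
).locate_file("nvidia/cublasmp/cu12")
nvshmem_dir = os.getenv("NVSHMEM_HOME") or metadata.distribution(
    "nvidia-nvshmem-cu12"
).locate_file("nvidia/nvshmem")
print(f"-DCUBLASMP_DIR={cublasmp_dir}")
print(f"-DNVSHMEM_DIR={nvshmem_dir}")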
1 change: 1 addition & 0 deletions tests/cpp/CMakeLists.txt
@@ -37,6 +37,7 @@ find_library(TE_LIB NAMES transformer_engine PATHS "${TE_LIB_PATH}/.." ${TE_LIB_
 message(STATUS "Found transformer_engine library: ${TE_LIB}")
 include_directories(../../transformer_engine/common/include)
 include_directories(../../transformer_engine/common)
+include_directories(../../transformer_engine)
 include_directories(${CMAKE_SOURCE_DIR})

 find_package(CUDAToolkit REQUIRED)
19 changes: 19 additions & 0 deletions tests/cpp/comm_gemm/CMakeLists.txt
@@ -0,0 +1,19 @@
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

add_executable(test_comm_gemm
test_comm_gemm.cu
../test_common.cu)

find_package(OpenMP REQUIRED)
find_package(MPI REQUIRED)
find_library(NCCL_LIB
NAMES nccl libnccl
PATH_SUFFIXES lib
REQUIRED)
target_include_directories(test_comm_gemm PRIVATE ${MPI_CXX_INCLUDE_PATH} $ENV{CUBLASMP_HOME}/include)
target_link_libraries(test_comm_gemm PUBLIC CUDA::cuda_driver CUDA::cudart GTest::gtest ${TE_LIB} CUDA::nvrtc CUDNN::cudnn MPI::MPI_CXX ${NCCL_LIB} OpenMP::OpenMP_CXX)

include(GoogleTest)
gtest_discover_tests(test_comm_gemm DISCOVERY_TIMEOUT 600)