diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py index b0040708b6..fa5204d919 100644 --- a/cuda_core/examples/jit_lto_fractal.py +++ b/cuda_core/examples/jit_lto_fractal.py @@ -294,8 +294,8 @@ def main(): axs.set_title(title) axs.axis("off") plt.show() + print("done!") if __name__ == "__main__": main() - print("done!") diff --git a/cuda_core/examples/memory_ops.py b/cuda_core/examples/memory_ops.py index 123b1f6a11..b3348d4820 100644 --- a/cuda_core/examples/memory_ops.py +++ b/cuda_core/examples/memory_ops.py @@ -25,111 +25,115 @@ launch, ) -if np.__version__ < "2.1.0": - print("This example requires NumPy 2.1.0 or later", file=sys.stderr) - sys.exit(0) - -# Kernel for memory operations -code = """ -extern "C" -__global__ void memory_ops(float* device_data, - float* pinned_data, - size_t N) { - const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid < N) { - // Access device memory - device_data[tid] = device_data[tid] + 1.0f; - - // Access pinned memory (zero-copy from GPU) - pinned_data[tid] = pinned_data[tid] * 3.0f; + +def main(): + if np.__version__ < "2.1.0": + print("This example requires NumPy 2.1.0 or later", file=sys.stderr) + sys.exit(0) + + # Kernel for memory operations + code = """ + extern "C" + __global__ void memory_ops(float* device_data, float* pinned_data, size_t N) { + const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + // Access device memory + device_data[tid] = device_data[tid] + 1.0f; + + // Access pinned memory (zero-copy from GPU) + pinned_data[tid] = pinned_data[tid] * 3.0f; + } } -} -""" - -dev = Device() -dev.set_current() -stream = dev.create_stream() -# tell CuPy to use our stream as the current stream: -cp.cuda.ExternalStream(int(stream.handle)).use() - -# Compile kernel -program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") -prog = Program(code, code_type="c++", options=program_options) -mod = prog.compile("cubin") -kernel = mod.get_kernel("memory_ops") - -# Create different memory resources -device_mr = dev.memory_resource -pinned_mr = LegacyPinnedMemoryResource() - -# Allocate different types of memory -size = 1024 -dtype = cp.float32 -element_size = dtype().itemsize -total_size = size * element_size - -# 1. Device Memory (GPU-only) -device_buffer = device_mr.allocate(total_size, stream=stream) -device_array = cp.from_dlpack(device_buffer).view(dtype=dtype) - -# 2. Pinned Memory (CPU memory, GPU accessible) -pinned_buffer = pinned_mr.allocate(total_size, stream=stream) -pinned_array = np.from_dlpack(pinned_buffer).view(dtype=dtype) - -# Initialize data -rng = cp.random.default_rng() -device_array[:] = rng.random(size, dtype=dtype) -pinned_array[:] = rng.random(size, dtype=dtype).get() - -# Store original values for verification -device_original = device_array.copy() -pinned_original = pinned_array.copy() - -# Sync before kernel launch -stream.sync() - -# Launch kernel -block = 256 -grid = (size + block - 1) // block -config = LaunchConfig(grid=grid, block=block) - -launch(stream, config, kernel, device_buffer, pinned_buffer, cp.uint64(size)) -stream.sync() - -# Verify kernel operations -assert cp.allclose(device_array, device_original + 1.0), "Device memory operation failed" -assert cp.allclose(pinned_array, pinned_original * 3.0), "Pinned memory operation failed" - -# Copy data between different memory types -print("\nCopying data between memory types...") - -# Copy from device to pinned memory -device_buffer.copy_to(pinned_buffer, stream=stream) -stream.sync() - -# Verify the copy operation -assert cp.allclose(pinned_array, device_array), "Device to pinned copy failed" - -# Create a new device buffer and copy from pinned -new_device_buffer = device_mr.allocate(total_size, stream=stream) -new_device_array = cp.from_dlpack(new_device_buffer).view(dtype=dtype) - -pinned_buffer.copy_to(new_device_buffer, stream=stream) -stream.sync() - -# Verify the copy operation -assert cp.allclose(new_device_array, pinned_array), "Pinned to device copy failed" - -# Clean up -device_buffer.close(stream) -pinned_buffer.close(stream) -new_device_buffer.close(stream) -stream.close() -cp.cuda.Stream.null.use() # reset CuPy's current stream to the null stream - -# Verify buffers are properly closed -assert device_buffer.handle == 0, "Device buffer should be closed" -assert pinned_buffer.handle == 0, "Pinned buffer should be closed" -assert new_device_buffer.handle == 0, "New device buffer should be closed" - -print("Memory management example completed!") + """ + + dev = Device() + dev.set_current() + stream = dev.create_stream() + # tell CuPy to use our stream as the current stream: + cp.cuda.ExternalStream(int(stream.handle)).use() + + # Compile kernel + program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") + prog = Program(code, code_type="c++", options=program_options) + mod = prog.compile("cubin") + kernel = mod.get_kernel("memory_ops") + + # Create different memory resources + device_mr = dev.memory_resource + pinned_mr = LegacyPinnedMemoryResource() + + # Allocate different types of memory + size = 1024 + dtype = cp.float32 + element_size = dtype().itemsize + total_size = size * element_size + + # 1. Device Memory (GPU-only) + device_buffer = device_mr.allocate(total_size, stream=stream) + device_array = cp.from_dlpack(device_buffer).view(dtype=dtype) + + # 2. Pinned Memory (CPU memory, GPU accessible) + pinned_buffer = pinned_mr.allocate(total_size, stream=stream) + pinned_array = np.from_dlpack(pinned_buffer).view(dtype=dtype) + + # Initialize data + rng = cp.random.default_rng() + device_array[:] = rng.random(size, dtype=dtype) + pinned_array[:] = rng.random(size, dtype=dtype).get() + + # Store original values for verification + device_original = device_array.copy() + pinned_original = pinned_array.copy() + + # Sync before kernel launch + stream.sync() + + # Launch kernel + block = 256 + grid = (size + block - 1) // block + config = LaunchConfig(grid=grid, block=block) + + launch(stream, config, kernel, device_buffer, pinned_buffer, cp.uint64(size)) + stream.sync() + + # Verify kernel operations + assert cp.allclose(device_array, device_original + 1.0), "Device memory operation failed" + assert cp.allclose(pinned_array, pinned_original * 3.0), "Pinned memory operation failed" + + # Copy data between different memory types + print("\nCopying data between memory types...") + + # Copy from device to pinned memory + device_buffer.copy_to(pinned_buffer, stream=stream) + stream.sync() + + # Verify the copy operation + assert cp.allclose(pinned_array, device_array), "Device to pinned copy failed" + + # Create a new device buffer and copy from pinned + new_device_buffer = device_mr.allocate(total_size, stream=stream) + new_device_array = cp.from_dlpack(new_device_buffer).view(dtype=dtype) + + pinned_buffer.copy_to(new_device_buffer, stream=stream) + stream.sync() + + # Verify the copy operation + assert cp.allclose(new_device_array, pinned_array), "Pinned to device copy failed" + + # Clean up + device_buffer.close(stream) + pinned_buffer.close(stream) + new_device_buffer.close(stream) + stream.close() + cp.cuda.Stream.null.use() # reset CuPy's current stream to the null stream + + # Verify buffers are properly closed + assert device_buffer.handle == 0, "Device buffer should be closed" + assert pinned_buffer.handle == 0, "Pinned buffer should be closed" + assert new_device_buffer.handle == 0, "New device buffer should be closed" + + print("Memory management example completed!") + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py index 433d63c9eb..a05c52ad5b 100644 --- a/cuda_core/examples/pytorch_example.py +++ b/cuda_core/examples/pytorch_example.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 @@ -17,96 +17,100 @@ import torch from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch -# SAXPY kernel - passing a as a pointer to avoid any type issues -code = """ -template -__global__ void saxpy_kernel(const T* a, const T* x, const T* y, T* out, size_t N) { - const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid < N) { - // Dereference a to get the scalar value - out[tid] = (*a) * x[tid] + y[tid]; - } -} -""" - -dev = Device() -dev.set_current() - -# Get PyTorch's current stream -pt_stream = torch.cuda.current_stream() -print(f"PyTorch stream: {pt_stream}") - - -# Create a wrapper class that implements __cuda_stream__ -class PyTorchStreamWrapper: - def __init__(self, pt_stream): - self.pt_stream = pt_stream - - def __cuda_stream__(self): - stream_id = self.pt_stream.cuda_stream - return (0, stream_id) # Return format required by CUDA Python - - -s = dev.create_stream(PyTorchStreamWrapper(pt_stream)) - -# prepare program -program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") -prog = Program(code, code_type="c++", options=program_options) -mod = prog.compile( - "cubin", - logs=sys.stdout, - name_expressions=("saxpy_kernel", "saxpy_kernel"), -) - -# Run in single precision -ker = mod.get_kernel("saxpy_kernel") -dtype = torch.float32 - -# prepare input/output -size = 64 -# Use a single element tensor for 'a' -a = torch.tensor([10.0], dtype=dtype, device="cuda") -x = torch.rand(size, dtype=dtype, device="cuda") -y = torch.rand(size, dtype=dtype, device="cuda") -out = torch.empty_like(x) - -# prepare launch -block = 32 -grid = int((size + block - 1) // block) -config = LaunchConfig(grid=grid, block=block) -ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) - -# launch kernel on our stream -launch(s, config, ker, *ker_args) - -# check result -assert torch.allclose(out, a.item() * x + y) -print("Single precision test passed!") - -# let's repeat again with double precision -ker = mod.get_kernel("saxpy_kernel") -dtype = torch.float64 - -# prepare input -size = 128 -# Use a single element tensor for 'a' -a = torch.tensor([42.0], dtype=dtype, device="cuda") -x = torch.rand(size, dtype=dtype, device="cuda") -y = torch.rand(size, dtype=dtype, device="cuda") - -# prepare output -out = torch.empty_like(x) - -# prepare launch -block = 64 -grid = int((size + block - 1) // block) -config = LaunchConfig(grid=grid, block=block) -ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) - -# launch kernel on PyTorch's stream -launch(s, config, ker, *ker_args) - -# check result -assert torch.allclose(out, a * x + y) -print("Double precision test passed!") -print("All tests passed successfully!") + +def main(): + # SAXPY kernel - passing a as a pointer to avoid any type issues + code = """ + template + __global__ void saxpy_kernel(const T* a, const T* x, const T* y, T* out, size_t N) { + const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + // Dereference a to get the scalar value + out[tid] = (*a) * x[tid] + y[tid]; + } + } + """ + + dev = Device() + dev.set_current() + + # Get PyTorch's current stream + pt_stream = torch.cuda.current_stream() + print(f"PyTorch stream: {pt_stream}") + + # Create a wrapper class that implements __cuda_stream__ + class PyTorchStreamWrapper: + def __init__(self, pt_stream): + self.pt_stream = pt_stream + + def __cuda_stream__(self): + stream_id = self.pt_stream.cuda_stream + return (0, stream_id) # Return format required by CUDA Python + + s = dev.create_stream(PyTorchStreamWrapper(pt_stream)) + + # prepare program + program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") + prog = Program(code, code_type="c++", options=program_options) + mod = prog.compile( + "cubin", + logs=sys.stdout, + name_expressions=("saxpy_kernel", "saxpy_kernel"), + ) + + # Run in single precision + ker = mod.get_kernel("saxpy_kernel") + dtype = torch.float32 + + # prepare input/output + size = 64 + # Use a single element tensor for 'a' + a = torch.tensor([10.0], dtype=dtype, device="cuda") + x = torch.rand(size, dtype=dtype, device="cuda") + y = torch.rand(size, dtype=dtype, device="cuda") + out = torch.empty_like(x) + + # prepare launch + block = 32 + grid = int((size + block - 1) // block) + config = LaunchConfig(grid=grid, block=block) + ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) + + # launch kernel on our stream + launch(s, config, ker, *ker_args) + + # check result + assert torch.allclose(out, a.item() * x + y) + print("Single precision test passed!") + + # let's repeat again with double precision + ker = mod.get_kernel("saxpy_kernel") + dtype = torch.float64 + + # prepare input + size = 128 + # Use a single element tensor for 'a' + a = torch.tensor([42.0], dtype=dtype, device="cuda") + x = torch.rand(size, dtype=dtype, device="cuda") + y = torch.rand(size, dtype=dtype, device="cuda") + + # prepare output + out = torch.empty_like(x) + + # prepare launch + block = 64 + grid = int((size + block - 1) // block) + config = LaunchConfig(grid=grid, block=block) + ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size) + + # launch kernel on PyTorch's stream + launch(s, config, ker, *ker_args) + + # check result + assert torch.allclose(out, a * x + y) + print("Double precision test passed!") + print("All tests passed successfully!") + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index aa0d77eff9..b25e23ce37 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -16,103 +16,104 @@ import cupy as cp from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch -# compute out = a * x + y -code = """ -template -__global__ void saxpy(const T a, - const T* x, - const T* y, - T* out, - size_t N) { - const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (size_t i=tid; i + __global__ void saxpy(const T a, const T* x, const T* y, T* out, size_t N) { + const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (size_t i=tid; i", "saxpy"), -) - -# run in single precision -ker = mod.get_kernel("saxpy") -dtype = cp.float32 - -# prepare input/output -size = cp.uint64(64) -a = dtype(10) -rng = cp.random.default_rng() -x = rng.random(size, dtype=dtype) -y = rng.random(size, dtype=dtype) -out = cp.empty_like(x) -dev.sync() # cupy runs on a different stream from s, so sync before accessing - -# prepare launch -block = 32 -grid = int((size + block - 1) // block) -config = LaunchConfig(grid=grid, block=block) -ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size) - -# launch kernel on stream s -launch(s, config, ker, *ker_args) -s.sync() - -# check result -assert cp.allclose(out, a * x + y) - -# let's repeat again, this time allocates our own out buffer instead of cupy's -# run in double precision -ker = mod.get_kernel("saxpy") -dtype = cp.float64 - -# prepare input -size = cp.uint64(128) -a = dtype(42) -x = rng.random(size, dtype=dtype) -y = rng.random(size, dtype=dtype) -dev.sync() - -# prepare output -buf = dev.allocate( - size * 8, # = dtype.itemsize - stream=s, -) - -# prepare launch -block = 64 -grid = int((size + block - 1) // block) -config = LaunchConfig(grid=grid, block=block) -ker_args = (a, x.data.ptr, y.data.ptr, buf, size) - -# launch kernel on stream s -launch(s, config, ker, *ker_args) -s.sync() - -# check result -# we wrap output buffer as a cupy array for simplicity -out = cp.ndarray( - size, dtype=dtype, memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0) -) -assert cp.allclose(out, a * x + y) - -# clean up resources that we allocate -# cupy cleans up automatically the rest -buf.close(s) -s.close() - -print("done!") + """ + + dev = Device() + dev.set_current() + s = dev.create_stream() + + # prepare program + program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}") + prog = Program(code, code_type="c++", options=program_options) + + # Note the use of the `name_expressions` argument to specify the template + # instantiations of the kernel that we will use. For non-templated kernels, + # `name_expressions` will simply contain the name of the kernels. + mod = prog.compile( + "cubin", + logs=sys.stdout, + name_expressions=("saxpy", "saxpy"), + ) + + # run in single precision + ker = mod.get_kernel("saxpy") + dtype = cp.float32 + + # prepare input/output + size = cp.uint64(64) + a = dtype(10) + rng = cp.random.default_rng() + x = rng.random(size, dtype=dtype) + y = rng.random(size, dtype=dtype) + out = cp.empty_like(x) + dev.sync() # cupy runs on a different stream from s, so sync before accessing + + # prepare launch + block = 32 + grid = int((size + block - 1) // block) + config = LaunchConfig(grid=grid, block=block) + ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size) + + # launch kernel on stream s + launch(s, config, ker, *ker_args) + s.sync() + + # check result + assert cp.allclose(out, a * x + y) + + # let's repeat again, this time allocates our own out buffer instead of cupy's + # run in double precision + ker = mod.get_kernel("saxpy") + dtype = cp.float64 + + # prepare input + size = cp.uint64(128) + a = dtype(42) + x = rng.random(size, dtype=dtype) + y = rng.random(size, dtype=dtype) + dev.sync() + + # prepare output + buf = dev.allocate( + size * 8, # = dtype.itemsize + stream=s, + ) + + # prepare launch + block = 64 + grid = int((size + block - 1) // block) + config = LaunchConfig(grid=grid, block=block) + ker_args = (a, x.data.ptr, y.data.ptr, buf, size) + + # launch kernel on stream s + launch(s, config, ker, *ker_args) + s.sync() + + # check result + # we wrap output buffer as a cupy array for simplicity + out = cp.ndarray( + size, dtype=dtype, memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0) + ) + assert cp.allclose(out, a * x + y) + + # clean up resources that we allocate + # cupy cleans up automatically the rest + buf.close(s) + s.close() + + print("done!") + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/show_device_properties.py b/cuda_core/examples/show_device_properties.py index 8b14cf0767..bed6399141 100644 --- a/cuda_core/examples/show_device_properties.py +++ b/cuda_core/examples/show_device_properties.py @@ -214,7 +214,7 @@ def print_device_properties(properties): # Print info about all CUDA devices in the system -def show_device_properties(): +def main(): ndev = system.get_num_devices() print(f"Number of GPUs: {ndev}") @@ -238,4 +238,4 @@ def show_device_properties(): if __name__ == "__main__": assert len(sys.argv) == 1, "no command-line arguments expected" - show_device_properties() + main() diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py index 497a4309cf..83380b19cb 100644 --- a/cuda_core/examples/simple_multi_gpu_example.py +++ b/cuda_core/examples/simple_multi_gpu_example.py @@ -14,114 +14,112 @@ import cupy as cp from cuda.core import Device, LaunchConfig, Program, launch, system -if system.get_num_devices() < 2: - print("this example requires at least 2 GPUs", file=sys.stderr) - sys.exit(0) - -dtype = cp.float32 -size = 50000 - -# Set GPU 0 -dev0 = Device(0) -dev0.set_current() -stream0 = dev0.create_stream() - -# Compile a kernel targeting GPU 0 to compute c = a + b -code_add = """ -extern "C" -__global__ void vector_add(const float* A, - const float* B, - float* C, - size_t N) { - const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (size_t i=tid; i - -namespace cg = cooperative_groups; - -extern "C" -__global__ void check_cluster_info(unsigned int* grid_dims, unsigned int* cluster_dims, unsigned int* block_dims) { - auto g = cg::this_grid(); - auto b = cg::this_thread_block(); - - if (g.cluster_rank() == 0 && g.block_rank() == 0 && g.thread_rank() == 0) { - // Store grid dimensions (in blocks) - grid_dims[0] = g.dim_blocks().x; - grid_dims[1] = g.dim_blocks().y; - grid_dims[2] = g.dim_blocks().z; - - // Store cluster dimensions - cluster_dims[0] = g.dim_clusters().x; - cluster_dims[1] = g.dim_clusters().y; - cluster_dims[2] = g.dim_clusters().z; - - // Store block dimensions (in threads) - block_dims[0] = b.dim_threads().x; - block_dims[1] = b.dim_threads().y; - block_dims[2] = b.dim_threads().z; - - // Also print to console - printf("grid dim: (%u, %u, %u)\n", g.dim_blocks().x, g.dim_blocks().y, g.dim_blocks().z); - printf("cluster dim: (%u, %u, %u)\n", g.dim_clusters().x, g.dim_clusters().y, g.dim_clusters().z); - printf("block dim: (%u, %u, %u)\n", b.dim_threads().x, b.dim_threads().y, b.dim_threads().z); +from cuda.core import Device, LaunchConfig, LegacyPinnedMemoryResource, Program, ProgramOptions, launch + + +def main(): + if np.lib.NumpyVersion(np.__version__) < "2.2.5": + print("This example requires NumPy 2.2.5 or later", file=sys.stderr) + sys.exit(0) + + # prepare include + cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME")) + if cuda_path is None: + print("this demo requires a valid CUDA_PATH environment variable set", file=sys.stderr) + sys.exit(0) + + cuda_include = Path(cuda_path) / "include" + assert cuda_include.is_dir() + include_path = [cuda_include] + cccl_include = Path(cuda_include) / "cccl" + if cccl_include.is_dir(): + include_path.insert(0, cccl_include) + + # print cluster info using a kernel and store results in pinned memory + code = r""" + #include + + namespace cg = cooperative_groups; + + extern "C" + __global__ void check_cluster_info(unsigned int* grid_dims, unsigned int* cluster_dims, unsigned int* block_dims) { + auto g = cg::this_grid(); + auto b = cg::this_thread_block(); + + if (g.cluster_rank() == 0 && g.block_rank() == 0 && g.thread_rank() == 0) { + // Store grid dimensions (in blocks) + grid_dims[0] = g.dim_blocks().x; + grid_dims[1] = g.dim_blocks().y; + grid_dims[2] = g.dim_blocks().z; + + // Store cluster dimensions + cluster_dims[0] = g.dim_clusters().x; + cluster_dims[1] = g.dim_clusters().y; + cluster_dims[2] = g.dim_clusters().z; + + // Store block dimensions (in threads) + + block_dims[0] = b.dim_threads().x; + block_dims[1] = b.dim_threads().y; + block_dims[2] = b.dim_threads().z; + + // Also print to console + printf("grid dim: (%u, %u, %u)\n", g.dim_blocks().x, g.dim_blocks().y, g.dim_blocks().z); + printf("cluster dim: (%u, %u, %u)\n", g.dim_clusters().x, g.dim_clusters().y, g.dim_clusters().z); + printf("block dim: (%u, %u, %u)\n", b.dim_threads().x, b.dim_threads().y, b.dim_threads().z); + } } -} -""" - -dev = Device() -arch = dev.compute_capability -if arch < (9, 0): - print( - "this demo requires compute capability >= 9.0 (since thread block cluster is a hardware feature)", - file=sys.stderr, + """ + + dev = Device() + arch = dev.compute_capability + if arch < (9, 0): + print( + "this demo requires compute capability >= 9.0 (since thread block cluster is a hardware feature)", + file=sys.stderr, + ) + sys.exit(0) + arch = "".join(f"{i}" for i in arch) + + # prepare program & compile kernel + dev.set_current() + prog = Program( + code, + code_type="c++", + options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path), ) - sys.exit(0) -arch = "".join(f"{i}" for i in arch) - -# prepare program & compile kernel -dev.set_current() -prog = Program( - code, - code_type="c++", - options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path), -) -mod = prog.compile(target_type="cubin") -ker = mod.get_kernel("check_cluster_info") - -# prepare launch config -grid = 4 -cluster = 2 -block = 32 -config = LaunchConfig(grid=grid, cluster=cluster, block=block) - -# allocate pinned memory to store kernel results -pinned_mr = LegacyPinnedMemoryResource() -element_size = np.dtype(np.uint32).itemsize - -# allocate 3 uint32 values each for grid, cluster, and block dimensions -grid_buffer = pinned_mr.allocate(3 * element_size) -cluster_buffer = pinned_mr.allocate(3 * element_size) -block_buffer = pinned_mr.allocate(3 * element_size) - -# create NumPy arrays from the pinned memory -grid_dims = np.from_dlpack(grid_buffer).view(dtype=np.uint32) -cluster_dims = np.from_dlpack(cluster_buffer).view(dtype=np.uint32) -block_dims = np.from_dlpack(block_buffer).view(dtype=np.uint32) - -# initialize arrays to zero -grid_dims[:] = 0 -cluster_dims[:] = 0 -block_dims[:] = 0 - -# launch kernel on the default stream -launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer) -dev.sync() - -# verify results -print("\nResults stored in pinned memory:") -print(f"Grid dimensions (blocks): {tuple(grid_dims)}") -print(f"Cluster dimensions: {tuple(cluster_dims)}") -print(f"Block dimensions (threads): {tuple(block_dims)}") - -# verify that grid conversion worked correctly: -# LaunchConfig(grid=4, cluster=2) should result in 8 total blocks (4 clusters * 2 blocks/cluster) -expected_grid_blocks = grid * cluster # 4 * 2 = 8 -actual_grid_blocks = grid_dims[0] - -print("\nVerification:") -print(f"LaunchConfig specified: grid={grid} clusters, cluster={cluster} blocks/cluster") -print(f"Expected total blocks: {expected_grid_blocks}") -print(f"Actual total blocks: {actual_grid_blocks}") - -if actual_grid_blocks == expected_grid_blocks: - print("✓ Grid conversion is correct!") -else: - print("✗ Grid conversion failed!") - sys.exit(1) - -print("done!") + mod = prog.compile(target_type="cubin") + ker = mod.get_kernel("check_cluster_info") + + # prepare launch config + grid = 4 + cluster = 2 + block = 32 + config = LaunchConfig(grid=grid, cluster=cluster, block=block) + + # allocate pinned memory to store kernel results + pinned_mr = LegacyPinnedMemoryResource() + element_size = np.dtype(np.uint32).itemsize + + # allocate 3 uint32 values each for grid, cluster, and block dimensions + grid_buffer = pinned_mr.allocate(3 * element_size) + cluster_buffer = pinned_mr.allocate(3 * element_size) + block_buffer = pinned_mr.allocate(3 * element_size) + + # create NumPy arrays from the pinned memory + grid_dims = np.from_dlpack(grid_buffer).view(dtype=np.uint32) + cluster_dims = np.from_dlpack(cluster_buffer).view(dtype=np.uint32) + block_dims = np.from_dlpack(block_buffer).view(dtype=np.uint32) + + # initialize arrays to zero + grid_dims[:] = 0 + cluster_dims[:] = 0 + block_dims[:] = 0 + + # launch kernel on the default stream + launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer) + dev.sync() + + # verify results + print("\nResults stored in pinned memory:") + print(f"Grid dimensions (blocks): {tuple(grid_dims)}") + print(f"Cluster dimensions: {tuple(cluster_dims)}") + print(f"Block dimensions (threads): {tuple(block_dims)}") + + # verify that grid conversion worked correctly: + # LaunchConfig(grid=4, cluster=2) should result in 8 total blocks (4 clusters * 2 blocks/cluster) + expected_grid_blocks = grid * cluster # 4 * 2 = 8 + actual_grid_blocks = grid_dims[0] + + print("\nVerification:") + print(f"LaunchConfig specified: grid={grid} clusters, cluster={cluster} blocks/cluster") + print(f"Expected total blocks: {expected_grid_blocks}") + print(f"Actual total blocks: {actual_grid_blocks}") + + if actual_grid_blocks == expected_grid_blocks: + print("✓ Grid conversion is correct!") + else: + print("✗ Grid conversion failed!") + sys.exit(1) + + print("done!") + + +if __name__ == "__main__": + main() diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index d31ab77208..3db719f0a1 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -12,53 +12,55 @@ import cupy as cp from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch -# compute c = a + b -code = """ -template -__global__ void vector_add(const T* A, - const T* B, - T* C, - size_t N) { - const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (size_t i=tid; i + __global__ void vector_add(const T* A, const T* B, T* C, size_t N) { + const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) { + C[i] = A[i] + B[i]; + } } -} -""" + """ + + dev = Device() + dev.set_current() + s = dev.create_stream() + # prepare program + program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") + prog = Program(code, code_type="c++", options=program_options) + mod = prog.compile("cubin", name_expressions=("vector_add",)) -dev = Device() -dev.set_current() -s = dev.create_stream() + # run in single precision + ker = mod.get_kernel("vector_add") + dtype = cp.float32 -# prepare program -program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}") -prog = Program(code, code_type="c++", options=program_options) -mod = prog.compile("cubin", name_expressions=("vector_add",)) + # prepare input/output + size = 50000 + rng = cp.random.default_rng() + a = rng.random(size, dtype=dtype) + b = rng.random(size, dtype=dtype) + c = cp.empty_like(a) -# run in single precision -ker = mod.get_kernel("vector_add") -dtype = cp.float32 + # cupy runs on a different stream from s, so sync before accessing + dev.sync() -# prepare input/output -size = 50000 -rng = cp.random.default_rng() -a = rng.random(size, dtype=dtype) -b = rng.random(size, dtype=dtype) -c = cp.empty_like(a) + # prepare launch + block = 256 + grid = (size + block - 1) // block + config = LaunchConfig(grid=grid, block=block) -# cupy runs on a different stream from s, so sync before accessing -dev.sync() + # launch kernel on stream s + launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) + s.sync() -# prepare launch -block = 256 -grid = (size + block - 1) // block -config = LaunchConfig(grid=grid, block=block) + # check result + assert cp.allclose(c, a + b) + print("done!") -# launch kernel on stream s -launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size)) -s.sync() -# check result -assert cp.allclose(c, a + b) -print("done!") +if __name__ == "__main__": + main()