diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py
index b0040708b6..fa5204d919 100644
--- a/cuda_core/examples/jit_lto_fractal.py
+++ b/cuda_core/examples/jit_lto_fractal.py
@@ -294,8 +294,8 @@ def main():
             axs.set_title(title)
             axs.axis("off")
         plt.show()
+    print("done!")
 
 
 if __name__ == "__main__":
     main()
-    print("done!")
diff --git a/cuda_core/examples/memory_ops.py b/cuda_core/examples/memory_ops.py
index 123b1f6a11..b3348d4820 100644
--- a/cuda_core/examples/memory_ops.py
+++ b/cuda_core/examples/memory_ops.py
@@ -25,111 +25,115 @@
     launch,
 )
 
-if np.__version__ < "2.1.0":
-    print("This example requires NumPy 2.1.0 or later", file=sys.stderr)
-    sys.exit(0)
-
-# Kernel for memory operations
-code = """
-extern "C"
-__global__ void memory_ops(float* device_data,
-                          float* pinned_data,
-                          size_t N) {
-    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
-    if (tid < N) {
-        // Access device memory
-        device_data[tid] = device_data[tid] + 1.0f;
-
-        // Access pinned memory (zero-copy from GPU)
-        pinned_data[tid] = pinned_data[tid] * 3.0f;
+
+def main():
+    if np.__version__ < "2.1.0":
+        print("This example requires NumPy 2.1.0 or later", file=sys.stderr)
+        sys.exit(0)
+
+    # Kernel for memory operations
+    code = """
+    extern "C"
+    __global__ void memory_ops(float* device_data, float* pinned_data, size_t N) {
+        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+        if (tid < N) {
+            // Access device memory
+            device_data[tid] = device_data[tid] + 1.0f;
+
+            // Access pinned memory (zero-copy from GPU)
+            pinned_data[tid] = pinned_data[tid] * 3.0f;
+        }
     }
-}
-"""
-
-dev = Device()
-dev.set_current()
-stream = dev.create_stream()
-# tell CuPy to use our stream as the current stream:
-cp.cuda.ExternalStream(int(stream.handle)).use()
-
-# Compile kernel
-program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
-prog = Program(code, code_type="c++", options=program_options)
-mod = prog.compile("cubin")
-kernel = mod.get_kernel("memory_ops")
-
-# Create different memory resources
-device_mr = dev.memory_resource
-pinned_mr = LegacyPinnedMemoryResource()
-
-# Allocate different types of memory
-size = 1024
-dtype = cp.float32
-element_size = dtype().itemsize
-total_size = size * element_size
-
-# 1. Device Memory (GPU-only)
-device_buffer = device_mr.allocate(total_size, stream=stream)
-device_array = cp.from_dlpack(device_buffer).view(dtype=dtype)
-
-# 2. Pinned Memory (CPU memory, GPU accessible)
-pinned_buffer = pinned_mr.allocate(total_size, stream=stream)
-pinned_array = np.from_dlpack(pinned_buffer).view(dtype=dtype)
-
-# Initialize data
-rng = cp.random.default_rng()
-device_array[:] = rng.random(size, dtype=dtype)
-pinned_array[:] = rng.random(size, dtype=dtype).get()
-
-# Store original values for verification
-device_original = device_array.copy()
-pinned_original = pinned_array.copy()
-
-# Sync before kernel launch
-stream.sync()
-
-# Launch kernel
-block = 256
-grid = (size + block - 1) // block
-config = LaunchConfig(grid=grid, block=block)
-
-launch(stream, config, kernel, device_buffer, pinned_buffer, cp.uint64(size))
-stream.sync()
-
-# Verify kernel operations
-assert cp.allclose(device_array, device_original + 1.0), "Device memory operation failed"
-assert cp.allclose(pinned_array, pinned_original * 3.0), "Pinned memory operation failed"
-
-# Copy data between different memory types
-print("\nCopying data between memory types...")
-
-# Copy from device to pinned memory
-device_buffer.copy_to(pinned_buffer, stream=stream)
-stream.sync()
-
-# Verify the copy operation
-assert cp.allclose(pinned_array, device_array), "Device to pinned copy failed"
-
-# Create a new device buffer and copy from pinned
-new_device_buffer = device_mr.allocate(total_size, stream=stream)
-new_device_array = cp.from_dlpack(new_device_buffer).view(dtype=dtype)
-
-pinned_buffer.copy_to(new_device_buffer, stream=stream)
-stream.sync()
-
-# Verify the copy operation
-assert cp.allclose(new_device_array, pinned_array), "Pinned to device copy failed"
-
-# Clean up
-device_buffer.close(stream)
-pinned_buffer.close(stream)
-new_device_buffer.close(stream)
-stream.close()
-cp.cuda.Stream.null.use()  # reset CuPy's current stream to the null stream
-
-# Verify buffers are properly closed
-assert device_buffer.handle == 0, "Device buffer should be closed"
-assert pinned_buffer.handle == 0, "Pinned buffer should be closed"
-assert new_device_buffer.handle == 0, "New device buffer should be closed"
-
-print("Memory management example completed!")
+    """
+
+    dev = Device()
+    dev.set_current()
+    stream = dev.create_stream()
+    # tell CuPy to use our stream as the current stream:
+    cp.cuda.ExternalStream(int(stream.handle)).use()
+
+    # Compile kernel
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(code, code_type="c++", options=program_options)
+    mod = prog.compile("cubin")
+    kernel = mod.get_kernel("memory_ops")
+
+    # Create different memory resources
+    device_mr = dev.memory_resource
+    pinned_mr = LegacyPinnedMemoryResource()
+
+    # Allocate different types of memory
+    size = 1024
+    dtype = cp.float32
+    element_size = dtype().itemsize
+    total_size = size * element_size
+
+    # 1. Device Memory (GPU-only)
+    device_buffer = device_mr.allocate(total_size, stream=stream)
+    device_array = cp.from_dlpack(device_buffer).view(dtype=dtype)
+
+    # 2. Pinned Memory (CPU memory, GPU accessible)
+    pinned_buffer = pinned_mr.allocate(total_size, stream=stream)
+    pinned_array = np.from_dlpack(pinned_buffer).view(dtype=dtype)
+
+    # Initialize data
+    rng = cp.random.default_rng()
+    device_array[:] = rng.random(size, dtype=dtype)
+    pinned_array[:] = rng.random(size, dtype=dtype).get()
+
+    # Store original values for verification
+    device_original = device_array.copy()
+    pinned_original = pinned_array.copy()
+
+    # Sync before kernel launch
+    stream.sync()
+
+    # Launch kernel
+    block = 256
+    grid = (size + block - 1) // block
+    config = LaunchConfig(grid=grid, block=block)
+
+    launch(stream, config, kernel, device_buffer, pinned_buffer, cp.uint64(size))
+    stream.sync()
+
+    # Verify kernel operations
+    assert cp.allclose(device_array, device_original + 1.0), "Device memory operation failed"
+    assert cp.allclose(pinned_array, pinned_original * 3.0), "Pinned memory operation failed"
+
+    # Copy data between different memory types
+    print("\nCopying data between memory types...")
+
+    # Copy from device to pinned memory
+    device_buffer.copy_to(pinned_buffer, stream=stream)
+    stream.sync()
+
+    # Verify the copy operation
+    assert cp.allclose(pinned_array, device_array), "Device to pinned copy failed"
+
+    # Create a new device buffer and copy from pinned
+    new_device_buffer = device_mr.allocate(total_size, stream=stream)
+    new_device_array = cp.from_dlpack(new_device_buffer).view(dtype=dtype)
+
+    pinned_buffer.copy_to(new_device_buffer, stream=stream)
+    stream.sync()
+
+    # Verify the copy operation
+    assert cp.allclose(new_device_array, pinned_array), "Pinned to device copy failed"
+
+    # Clean up
+    device_buffer.close(stream)
+    pinned_buffer.close(stream)
+    new_device_buffer.close(stream)
+    stream.close()
+    cp.cuda.Stream.null.use()  # reset CuPy's current stream to the null stream
+
+    # Verify buffers are properly closed
+    assert device_buffer.handle == 0, "Device buffer should be closed"
+    assert pinned_buffer.handle == 0, "Pinned buffer should be closed"
+    assert new_device_buffer.handle == 0, "New device buffer should be closed"
+
+    print("Memory management example completed!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py
index 433d63c9eb..a05c52ad5b 100644
--- a/cuda_core/examples/pytorch_example.py
+++ b/cuda_core/examples/pytorch_example.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 
@@ -17,96 +17,100 @@
 import torch
 from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch
 
-# SAXPY kernel - passing a as a pointer to avoid any type issues
-code = """
-template<typename T>
-__global__ void saxpy_kernel(const T* a, const T* x, const T* y, T* out, size_t N) {
- const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
- if (tid < N) {
-   // Dereference a to get the scalar value
-   out[tid] = (*a) * x[tid] + y[tid];
- }
-}
-"""
-
-dev = Device()
-dev.set_current()
-
-# Get PyTorch's current stream
-pt_stream = torch.cuda.current_stream()
-print(f"PyTorch stream: {pt_stream}")
-
-
-# Create a wrapper class that implements __cuda_stream__
-class PyTorchStreamWrapper:
-    def __init__(self, pt_stream):
-        self.pt_stream = pt_stream
-
-    def __cuda_stream__(self):
-        stream_id = self.pt_stream.cuda_stream
-        return (0, stream_id)  # Return format required by CUDA Python
-
-
-s = dev.create_stream(PyTorchStreamWrapper(pt_stream))
-
-# prepare program
-program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
-prog = Program(code, code_type="c++", options=program_options)
-mod = prog.compile(
-    "cubin",
-    logs=sys.stdout,
-    name_expressions=("saxpy_kernel<float>", "saxpy_kernel<double>"),
-)
-
-# Run in single precision
-ker = mod.get_kernel("saxpy_kernel<float>")
-dtype = torch.float32
-
-# prepare input/output
-size = 64
-# Use a single element tensor for 'a'
-a = torch.tensor([10.0], dtype=dtype, device="cuda")
-x = torch.rand(size, dtype=dtype, device="cuda")
-y = torch.rand(size, dtype=dtype, device="cuda")
-out = torch.empty_like(x)
-
-# prepare launch
-block = 32
-grid = int((size + block - 1) // block)
-config = LaunchConfig(grid=grid, block=block)
-ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
-
-# launch kernel on our stream
-launch(s, config, ker, *ker_args)
-
-# check result
-assert torch.allclose(out, a.item() * x + y)
-print("Single precision test passed!")
-
-# let's repeat again with double precision
-ker = mod.get_kernel("saxpy_kernel<double>")
-dtype = torch.float64
-
-# prepare input
-size = 128
-# Use a single element tensor for 'a'
-a = torch.tensor([42.0], dtype=dtype, device="cuda")
-x = torch.rand(size, dtype=dtype, device="cuda")
-y = torch.rand(size, dtype=dtype, device="cuda")
-
-# prepare output
-out = torch.empty_like(x)
-
-# prepare launch
-block = 64
-grid = int((size + block - 1) // block)
-config = LaunchConfig(grid=grid, block=block)
-ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
-
-# launch kernel on PyTorch's stream
-launch(s, config, ker, *ker_args)
-
-# check result
-assert torch.allclose(out, a * x + y)
-print("Double precision test passed!")
-print("All tests passed successfully!")
+
+def main():
+    # SAXPY kernel - passing a as a pointer to avoid any type issues
+    code = """
+    template<typename T>
+    __global__ void saxpy_kernel(const T* a, const T* x, const T* y, T* out, size_t N) {
+        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+        if (tid < N) {
+            // Dereference a to get the scalar value
+            out[tid] = (*a) * x[tid] + y[tid];
+        }
+    }
+    """
+
+    dev = Device()
+    dev.set_current()
+
+    # Get PyTorch's current stream
+    pt_stream = torch.cuda.current_stream()
+    print(f"PyTorch stream: {pt_stream}")
+
+    # Create a wrapper class that implements __cuda_stream__
+    class PyTorchStreamWrapper:
+        def __init__(self, pt_stream):
+            self.pt_stream = pt_stream
+
+        def __cuda_stream__(self):
+            stream_id = self.pt_stream.cuda_stream
+            return (0, stream_id)  # Return format required by CUDA Python
+
+    s = dev.create_stream(PyTorchStreamWrapper(pt_stream))
+
+    # prepare program
+    program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
+    prog = Program(code, code_type="c++", options=program_options)
+    mod = prog.compile(
+        "cubin",
+        logs=sys.stdout,
+        name_expressions=("saxpy_kernel<float>", "saxpy_kernel<double>"),
+    )
+
+    # Run in single precision
+    ker = mod.get_kernel("saxpy_kernel<float>")
+    dtype = torch.float32
+
+    # prepare input/output
+    size = 64
+    # Use a single element tensor for 'a'
+    a = torch.tensor([10.0], dtype=dtype, device="cuda")
+    x = torch.rand(size, dtype=dtype, device="cuda")
+    y = torch.rand(size, dtype=dtype, device="cuda")
+    out = torch.empty_like(x)
+
+    # prepare launch
+    block = 32
+    grid = int((size + block - 1) // block)
+    config = LaunchConfig(grid=grid, block=block)
+    ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+
+    # launch kernel on our stream
+    launch(s, config, ker, *ker_args)
+
+    # check result
+    assert torch.allclose(out, a.item() * x + y)
+    print("Single precision test passed!")
+
+    # let's repeat again with double precision
+    ker = mod.get_kernel("saxpy_kernel<double>")
+    dtype = torch.float64
+
+    # prepare input
+    size = 128
+    # Use a single element tensor for 'a'
+    a = torch.tensor([42.0], dtype=dtype, device="cuda")
+    x = torch.rand(size, dtype=dtype, device="cuda")
+    y = torch.rand(size, dtype=dtype, device="cuda")
+
+    # prepare output
+    out = torch.empty_like(x)
+
+    # prepare launch
+    block = 64
+    grid = int((size + block - 1) // block)
+    config = LaunchConfig(grid=grid, block=block)
+    ker_args = (a.data_ptr(), x.data_ptr(), y.data_ptr(), out.data_ptr(), size)
+
+    # launch kernel on PyTorch's stream
+    launch(s, config, ker, *ker_args)
+
+    # check result
+    assert torch.allclose(out, a * x + y)
+    print("Double precision test passed!")
+    print("All tests passed successfully!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py
index aa0d77eff9..b25e23ce37 100644
--- a/cuda_core/examples/saxpy.py
+++ b/cuda_core/examples/saxpy.py
@@ -16,103 +16,104 @@
 import cupy as cp
 from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch
 
-# compute out = a * x + y
-code = """
-template<typename T>
-__global__ void saxpy(const T a,
-                      const T* x,
-                      const T* y,
-                      T* out,
-                      size_t N) {
-    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
-    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
-        out[i] = a * x[i] + y[i];
+
+def main():
+    # compute out = a * x + y
+    code = """
+    template<typename T>
+    __global__ void saxpy(const T a, const T* x, const T* y, T* out, size_t N) {
+        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+        for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
+            out[i] = a * x[i] + y[i];
+        }
     }
-}
-"""
-
-
-dev = Device()
-dev.set_current()
-s = dev.create_stream()
-
-# prepare program
-program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
-prog = Program(code, code_type="c++", options=program_options)
-
-# Note the use of the `name_expressions` argument to specify the template
-# instantiations of the kernel that we will use. For non-templated kernels,
-# `name_expressions` will simply contain the name of the kernels.
-mod = prog.compile(
-    "cubin",
-    logs=sys.stdout,
-    name_expressions=("saxpy<float>", "saxpy<double>"),
-)
-
-# run in single precision
-ker = mod.get_kernel("saxpy<float>")
-dtype = cp.float32
-
-# prepare input/output
-size = cp.uint64(64)
-a = dtype(10)
-rng = cp.random.default_rng()
-x = rng.random(size, dtype=dtype)
-y = rng.random(size, dtype=dtype)
-out = cp.empty_like(x)
-dev.sync()  # cupy runs on a different stream from s, so sync before accessing
-
-# prepare launch
-block = 32
-grid = int((size + block - 1) // block)
-config = LaunchConfig(grid=grid, block=block)
-ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
-
-# launch kernel on stream s
-launch(s, config, ker, *ker_args)
-s.sync()
-
-# check result
-assert cp.allclose(out, a * x + y)
-
-# let's repeat again, this time allocates our own out buffer instead of cupy's
-# run in double precision
-ker = mod.get_kernel("saxpy<double>")
-dtype = cp.float64
-
-# prepare input
-size = cp.uint64(128)
-a = dtype(42)
-x = rng.random(size, dtype=dtype)
-y = rng.random(size, dtype=dtype)
-dev.sync()
-
-# prepare output
-buf = dev.allocate(
-    size * 8,  # = dtype.itemsize
-    stream=s,
-)
-
-# prepare launch
-block = 64
-grid = int((size + block - 1) // block)
-config = LaunchConfig(grid=grid, block=block)
-ker_args = (a, x.data.ptr, y.data.ptr, buf, size)
-
-# launch kernel on stream s
-launch(s, config, ker, *ker_args)
-s.sync()
-
-# check result
-# we wrap output buffer as a cupy array for simplicity
-out = cp.ndarray(
-    size, dtype=dtype, memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0)
-)
-assert cp.allclose(out, a * x + y)
-
-# clean up resources that we allocate
-# cupy cleans up automatically the rest
-buf.close(s)
-s.close()
-
-print("done!")
+    """
+
+    dev = Device()
+    dev.set_current()
+    s = dev.create_stream()
+
+    # prepare program
+    program_options = ProgramOptions(std="c++11", arch=f"sm_{dev.arch}")
+    prog = Program(code, code_type="c++", options=program_options)
+
+    # Note the use of the `name_expressions` argument to specify the template
+    # instantiations of the kernel that we will use. For non-templated kernels,
+    # `name_expressions` will simply contain the name of the kernels.
+    mod = prog.compile(
+        "cubin",
+        logs=sys.stdout,
+        name_expressions=("saxpy<float>", "saxpy<double>"),
+    )
+
+    # run in single precision
+    ker = mod.get_kernel("saxpy<float>")
+    dtype = cp.float32
+
+    # prepare input/output
+    size = cp.uint64(64)
+    a = dtype(10)
+    rng = cp.random.default_rng()
+    x = rng.random(size, dtype=dtype)
+    y = rng.random(size, dtype=dtype)
+    out = cp.empty_like(x)
+    dev.sync()  # cupy runs on a different stream from s, so sync before accessing
+
+    # prepare launch
+    block = 32
+    grid = int((size + block - 1) // block)
+    config = LaunchConfig(grid=grid, block=block)
+    ker_args = (a, x.data.ptr, y.data.ptr, out.data.ptr, size)
+
+    # launch kernel on stream s
+    launch(s, config, ker, *ker_args)
+    s.sync()
+
+    # check result
+    assert cp.allclose(out, a * x + y)
+
+    # let's repeat again, this time allocates our own out buffer instead of cupy's
+    # run in double precision
+    ker = mod.get_kernel("saxpy<double>")
+    dtype = cp.float64
+
+    # prepare input
+    size = cp.uint64(128)
+    a = dtype(42)
+    x = rng.random(size, dtype=dtype)
+    y = rng.random(size, dtype=dtype)
+    dev.sync()
+
+    # prepare output
+    buf = dev.allocate(
+        size * 8,  # = dtype.itemsize
+        stream=s,
+    )
+
+    # prepare launch
+    block = 64
+    grid = int((size + block - 1) // block)
+    config = LaunchConfig(grid=grid, block=block)
+    ker_args = (a, x.data.ptr, y.data.ptr, buf, size)
+
+    # launch kernel on stream s
+    launch(s, config, ker, *ker_args)
+    s.sync()
+
+    # check result
+    # we wrap output buffer as a cupy array for simplicity
+    out = cp.ndarray(
+        size, dtype=dtype, memptr=cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(int(buf.handle), buf.size, buf), 0)
+    )
+    assert cp.allclose(out, a * x + y)
+
+    # clean up resources that we allocate
+    # cupy cleans up automatically the rest
+    buf.close(s)
+    s.close()
+
+    print("done!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/show_device_properties.py b/cuda_core/examples/show_device_properties.py
index 8b14cf0767..bed6399141 100644
--- a/cuda_core/examples/show_device_properties.py
+++ b/cuda_core/examples/show_device_properties.py
@@ -214,7 +214,7 @@ def print_device_properties(properties):
 
 
 # Print info about all CUDA devices in the system
-def show_device_properties():
+def main():
     ndev = system.get_num_devices()
     print(f"Number of GPUs: {ndev}")
 
@@ -238,4 +238,4 @@ def show_device_properties():
 
 if __name__ == "__main__":
     assert len(sys.argv) == 1, "no command-line arguments expected"
-    show_device_properties()
+    main()
diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py
index 497a4309cf..83380b19cb 100644
--- a/cuda_core/examples/simple_multi_gpu_example.py
+++ b/cuda_core/examples/simple_multi_gpu_example.py
@@ -14,114 +14,112 @@
 import cupy as cp
 from cuda.core import Device, LaunchConfig, Program, launch, system
 
-if system.get_num_devices() < 2:
-    print("this example requires at least 2 GPUs", file=sys.stderr)
-    sys.exit(0)
-
-dtype = cp.float32
-size = 50000
-
-# Set GPU 0
-dev0 = Device(0)
-dev0.set_current()
-stream0 = dev0.create_stream()
-
-# Compile a kernel targeting GPU 0 to compute c = a + b
-code_add = """
-extern "C"
-__global__ void vector_add(const float* A,
-                           const float* B,
-                           float* C,
-                           size_t N) {
-    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
-    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
-        C[i] = A[i] + B[i];
+
+def main():
+    if system.get_num_devices() < 2:
+        print("this example requires at least 2 GPUs", file=sys.stderr)
+        sys.exit(0)
+
+    dtype = cp.float32
+    size = 50000
+
+    # Set GPU 0
+    dev0 = Device(0)
+    dev0.set_current()
+    stream0 = dev0.create_stream()
+
+    # Compile a kernel targeting GPU 0 to compute c = a + b
+    code_add = """
+    extern "C"
+    __global__ void vector_add(const float* A, const float* B, float* C, size_t N) {
+        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+        for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
+            C[i] = A[i] + B[i];
+        }
     }
-}
-"""
-prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev0.arch}"})
-mod_add = prog_add.compile("cubin")
-ker_add = mod_add.get_kernel("vector_add")
-
-# Set GPU 1
-dev1 = Device(1)
-dev1.set_current()
-stream1 = dev1.create_stream()
-
-# Compile a kernel targeting GPU 1 to compute c = a - b
-code_sub = """
-extern "C"
-__global__ void vector_sub(const float* A,
-                           const float* B,
-                           float* C,
-                           size_t N) {
-    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
-    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
-        C[tid] = A[tid] - B[tid];
+    """
+    prog_add = Program(code_add, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev0.arch}"})
+    mod_add = prog_add.compile("cubin")
+    ker_add = mod_add.get_kernel("vector_add")
+
+    # Set GPU 1
+    dev1 = Device(1)
+    dev1.set_current()
+    stream1 = dev1.create_stream()
+
+    # Compile a kernel targeting GPU 1 to compute c = a - b
+    code_sub = """
+    extern "C"
+    __global__ void vector_sub(const float* A, const float* B, float* C, size_t N) {
+        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+        for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
+            C[tid] = A[tid] - B[tid];
+        }
     }
-}
-"""
-prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev1.arch}"})
-mod_sub = prog_sub.compile("cubin")
-ker_sub = mod_sub.get_kernel("vector_sub")
-
-
-# This adaptor ensures that any foreign stream (ex: from CuPy) that have not
-# yet supported the __cuda_stream__ protocol can still be recognized by
-# cuda.core.
-class StreamAdaptor:
-    def __init__(self, obj):
-        self.obj = obj
-
-    def __cuda_stream__(self):
-        # Note: CuPy streams have a .ptr attribute
-        return (0, self.obj.ptr)
-
-
-# Create launch configs for each kernel that will be executed on the respective
-# CUDA streams.
-block = 256
-grid = (size + block - 1) // block
-config0 = LaunchConfig(grid=grid, block=block)
-config1 = LaunchConfig(grid=grid, block=block)
-
-# Allocate memory on GPU 0
-# Note: This runs on CuPy's current stream for GPU 0
-dev0.set_current()
-rng = cp.random.default_rng()
-a = rng.random(size, dtype=dtype)
-b = rng.random(size, dtype=dtype)
-c = cp.empty_like(a)
-cp_stream0 = dev0.create_stream(StreamAdaptor(cp.cuda.get_current_stream()))
-
-# Establish a stream order to ensure that memory has been initialized before
-# accessed by the kernel.
-stream0.wait(cp_stream0)
-
-# Launch the add kernel on GPU 0 / stream 0
-launch(stream0, config0, ker_add, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
-
-# Allocate memory on GPU 1
-# Note: This runs on CuPy's current stream for GPU 1.
-dev1.set_current()
-rng = cp.random.default_rng()
-x = rng.random(size, dtype=dtype)
-y = rng.random(size, dtype=dtype)
-z = cp.empty_like(a)
-cp_stream1 = dev1.create_stream(StreamAdaptor(cp.cuda.get_current_stream()))
-
-# Establish a stream order
-stream1.wait(cp_stream1)
-
-# Launch the subtract kernel on GPU 1 / stream 1
-launch(stream1, config1, ker_sub, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
-
-# Synchronize both GPUs are validate the results
-dev0.set_current()
-stream0.sync()
-assert cp.allclose(c, a + b)
-dev1.set_current()
-stream1.sync()
-assert cp.allclose(z, x - y)
-
-print("done")
+    """
+    prog_sub = Program(code_sub, code_type="c++", options={"std": "c++17", "arch": f"sm_{dev1.arch}"})
+    mod_sub = prog_sub.compile("cubin")
+    ker_sub = mod_sub.get_kernel("vector_sub")
+
+    # This adaptor ensures that any foreign stream (ex: from CuPy) that have not
+    # yet supported the __cuda_stream__ protocol can still be recognized by
+    # cuda.core.
+    class StreamAdaptor:
+        def __init__(self, obj):
+            self.obj = obj
+
+        def __cuda_stream__(self):
+            # Note: CuPy streams have a .ptr attribute
+            return (0, self.obj.ptr)
+
+    # Create launch configs for each kernel that will be executed on the respective
+    # CUDA streams.
+    block = 256
+    grid = (size + block - 1) // block
+    config0 = LaunchConfig(grid=grid, block=block)
+    config1 = LaunchConfig(grid=grid, block=block)
+
+    # Allocate memory on GPU 0
+    # Note: This runs on CuPy's current stream for GPU 0
+    dev0.set_current()
+    rng = cp.random.default_rng()
+    a = rng.random(size, dtype=dtype)
+    b = rng.random(size, dtype=dtype)
+    c = cp.empty_like(a)
+    cp_stream0 = dev0.create_stream(StreamAdaptor(cp.cuda.get_current_stream()))
+
+    # Establish a stream order to ensure that memory has been initialized before
+    # accessed by the kernel.
+    stream0.wait(cp_stream0)
+
+    # Launch the add kernel on GPU 0 / stream 0
+    launch(stream0, config0, ker_add, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+
+    # Allocate memory on GPU 1
+    # Note: This runs on CuPy's current stream for GPU 1.
+    dev1.set_current()
+    rng = cp.random.default_rng()
+    x = rng.random(size, dtype=dtype)
+    y = rng.random(size, dtype=dtype)
+    z = cp.empty_like(a)
+    cp_stream1 = dev1.create_stream(StreamAdaptor(cp.cuda.get_current_stream()))
+
+    # Establish a stream order
+    stream1.wait(cp_stream1)
+
+    # Launch the subtract kernel on GPU 1 / stream 1
+    launch(stream1, config1, ker_sub, x.data.ptr, y.data.ptr, z.data.ptr, cp.uint64(size))
+
+    # Synchronize both GPUs are validate the results
+    dev0.set_current()
+    stream0.sync()
+    assert cp.allclose(c, a + b)
+    dev1.set_current()
+    stream1.sync()
+    assert cp.allclose(z, x - y)
+
+    print("done")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/strided_memory_view_cpu.py b/cuda_core/examples/strided_memory_view_cpu.py
index a20377cc76..37bf15a2af 100644
--- a/cuda_core/examples/strided_memory_view_cpu.py
+++ b/cuda_core/examples/strided_memory_view_cpu.py
@@ -78,7 +78,7 @@ def my_func(arr):
     cpu_func(cpu_prog.cast("int*", view.ptr), size)
 
 
-def run():
+def main():
     global my_func
     if not FFI:
         return
@@ -130,4 +130,4 @@ def run():
 
 
 if __name__ == "__main__":
-    run()
+    main()
diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py
index e91ddc25cc..3232089327 100644
--- a/cuda_core/examples/strided_memory_view_gpu.py
+++ b/cuda_core/examples/strided_memory_view_gpu.py
@@ -83,7 +83,7 @@ def my_func(arr, work_stream, gpu_ker):
     work_stream.sync()
 
 
-def run():
+def main():
     global my_func
     if not cp:
         return None
@@ -123,4 +123,4 @@ def run():
 
 
 if __name__ == "__main__":
-    run()
+    main()
diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py
index f1ea8b8579..3ea4e7ee6b 100644
--- a/cuda_core/examples/thread_block_cluster.py
+++ b/cuda_core/examples/thread_block_cluster.py
@@ -11,137 +11,139 @@
 
 import os
 import sys
+from pathlib import Path
 
 import numpy as np
-from cuda.core import (
-    Device,
-    LaunchConfig,
-    LegacyPinnedMemoryResource,
-    Program,
-    ProgramOptions,
-    launch,
-)
-
-if np.lib.NumpyVersion(np.__version__) < "2.2.5":
-    print("This example requires NumPy 2.2.5 or later", file=sys.stderr)
-    sys.exit(0)
-
-# prepare include
-cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME"))
-if cuda_path is None:
-    print("this demo requires a valid CUDA_PATH environment variable set", file=sys.stderr)
-    sys.exit(0)
-cuda_include = os.path.join(cuda_path, "include")
-assert os.path.isdir(cuda_include)
-include_path = [cuda_include]
-cccl_include = os.path.join(cuda_include, "cccl")
-if os.path.isdir(cccl_include):
-    include_path.insert(0, cccl_include)
-
-# print cluster info using a kernel and store results in pinned memory
-code = r"""
-#include <cooperative_groups.h>
-
-namespace cg = cooperative_groups;
-
-extern "C"
-__global__ void check_cluster_info(unsigned int* grid_dims, unsigned int* cluster_dims, unsigned int* block_dims) {
-    auto g = cg::this_grid();
-    auto b = cg::this_thread_block();
-
-    if (g.cluster_rank() == 0 && g.block_rank() == 0 && g.thread_rank() == 0) {
-        // Store grid dimensions (in blocks)
-        grid_dims[0] = g.dim_blocks().x;
-        grid_dims[1] = g.dim_blocks().y;
-        grid_dims[2] = g.dim_blocks().z;
-
-        // Store cluster dimensions
-        cluster_dims[0] = g.dim_clusters().x;
-        cluster_dims[1] = g.dim_clusters().y;
-        cluster_dims[2] = g.dim_clusters().z;
-
-        // Store block dimensions (in threads)
-        block_dims[0] = b.dim_threads().x;
-        block_dims[1] = b.dim_threads().y;
-        block_dims[2] = b.dim_threads().z;
-
-        // Also print to console
-        printf("grid dim: (%u, %u, %u)\n", g.dim_blocks().x, g.dim_blocks().y, g.dim_blocks().z);
-        printf("cluster dim: (%u, %u, %u)\n", g.dim_clusters().x, g.dim_clusters().y, g.dim_clusters().z);
-        printf("block dim: (%u, %u, %u)\n", b.dim_threads().x, b.dim_threads().y, b.dim_threads().z);
+from cuda.core import Device, LaunchConfig, LegacyPinnedMemoryResource, Program, ProgramOptions, launch
+
+
+def main():
+    if np.lib.NumpyVersion(np.__version__) < "2.2.5":
+        print("This example requires NumPy 2.2.5 or later", file=sys.stderr)
+        sys.exit(0)
+
+    # prepare include
+    cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME"))
+    if cuda_path is None:
+        print("this demo requires a valid CUDA_PATH environment variable set", file=sys.stderr)
+        sys.exit(0)
+
+    cuda_include = Path(cuda_path) / "include"
+    assert cuda_include.is_dir()
+    include_path = [cuda_include]
+    cccl_include = Path(cuda_include) / "cccl"
+    if cccl_include.is_dir():
+        include_path.insert(0, cccl_include)
+
+    # print cluster info using a kernel and store results in pinned memory
+    code = r"""
+    #include <cooperative_groups.h>
+
+    namespace cg = cooperative_groups;
+
+    extern "C"
+    __global__ void check_cluster_info(unsigned int* grid_dims, unsigned int* cluster_dims, unsigned int* block_dims) {
+        auto g = cg::this_grid();
+        auto b = cg::this_thread_block();
+
+        if (g.cluster_rank() == 0 && g.block_rank() == 0 && g.thread_rank() == 0) {
+            // Store grid dimensions (in blocks)
+            grid_dims[0] = g.dim_blocks().x;
+            grid_dims[1] = g.dim_blocks().y;
+            grid_dims[2] = g.dim_blocks().z;
+
+            // Store cluster dimensions
+            cluster_dims[0] = g.dim_clusters().x;
+            cluster_dims[1] = g.dim_clusters().y;
+            cluster_dims[2] = g.dim_clusters().z;
+
+            // Store block dimensions (in threads)
+
+            block_dims[0] = b.dim_threads().x;
+            block_dims[1] = b.dim_threads().y;
+            block_dims[2] = b.dim_threads().z;
+
+            // Also print to console
+            printf("grid dim: (%u, %u, %u)\n", g.dim_blocks().x, g.dim_blocks().y, g.dim_blocks().z);
+            printf("cluster dim: (%u, %u, %u)\n", g.dim_clusters().x, g.dim_clusters().y, g.dim_clusters().z);
+            printf("block dim: (%u, %u, %u)\n", b.dim_threads().x, b.dim_threads().y, b.dim_threads().z);
+        }
     }
-}
-"""
-
-dev = Device()
-arch = dev.compute_capability
-if arch < (9, 0):
-    print(
-        "this demo requires compute capability >= 9.0 (since thread block cluster is a hardware feature)",
-        file=sys.stderr,
+    """
+
+    dev = Device()
+    arch = dev.compute_capability
+    if arch < (9, 0):
+        print(
+            "this demo requires compute capability >= 9.0 (since thread block cluster is a hardware feature)",
+            file=sys.stderr,
+        )
+        sys.exit(0)
+    arch = "".join(f"{i}" for i in arch)
+
+    # prepare program & compile kernel
+    dev.set_current()
+    prog = Program(
+        code,
+        code_type="c++",
+        options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path),
     )
-    sys.exit(0)
-arch = "".join(f"{i}" for i in arch)
-
-# prepare program & compile kernel
-dev.set_current()
-prog = Program(
-    code,
-    code_type="c++",
-    options=ProgramOptions(arch=f"sm_{arch}", std="c++17", include_path=include_path),
-)
-mod = prog.compile(target_type="cubin")
-ker = mod.get_kernel("check_cluster_info")
-
-# prepare launch config
-grid = 4
-cluster = 2
-block = 32
-config = LaunchConfig(grid=grid, cluster=cluster, block=block)
-
-# allocate pinned memory to store kernel results
-pinned_mr = LegacyPinnedMemoryResource()
-element_size = np.dtype(np.uint32).itemsize
-
-# allocate 3 uint32 values each for grid, cluster, and block dimensions
-grid_buffer = pinned_mr.allocate(3 * element_size)
-cluster_buffer = pinned_mr.allocate(3 * element_size)
-block_buffer = pinned_mr.allocate(3 * element_size)
-
-# create NumPy arrays from the pinned memory
-grid_dims = np.from_dlpack(grid_buffer).view(dtype=np.uint32)
-cluster_dims = np.from_dlpack(cluster_buffer).view(dtype=np.uint32)
-block_dims = np.from_dlpack(block_buffer).view(dtype=np.uint32)
-
-# initialize arrays to zero
-grid_dims[:] = 0
-cluster_dims[:] = 0
-block_dims[:] = 0
-
-# launch kernel on the default stream
-launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer)
-dev.sync()
-
-# verify results
-print("\nResults stored in pinned memory:")
-print(f"Grid dimensions (blocks): {tuple(grid_dims)}")
-print(f"Cluster dimensions: {tuple(cluster_dims)}")
-print(f"Block dimensions (threads): {tuple(block_dims)}")
-
-# verify that grid conversion worked correctly:
-# LaunchConfig(grid=4, cluster=2) should result in 8 total blocks (4 clusters * 2 blocks/cluster)
-expected_grid_blocks = grid * cluster  # 4 * 2 = 8
-actual_grid_blocks = grid_dims[0]
-
-print("\nVerification:")
-print(f"LaunchConfig specified: grid={grid} clusters, cluster={cluster} blocks/cluster")
-print(f"Expected total blocks: {expected_grid_blocks}")
-print(f"Actual total blocks: {actual_grid_blocks}")
-
-if actual_grid_blocks == expected_grid_blocks:
-    print("✓ Grid conversion is correct!")
-else:
-    print("✗ Grid conversion failed!")
-    sys.exit(1)
-
-print("done!")
+    mod = prog.compile(target_type="cubin")
+    ker = mod.get_kernel("check_cluster_info")
+
+    # prepare launch config
+    grid = 4
+    cluster = 2
+    block = 32
+    config = LaunchConfig(grid=grid, cluster=cluster, block=block)
+
+    # allocate pinned memory to store kernel results
+    pinned_mr = LegacyPinnedMemoryResource()
+    element_size = np.dtype(np.uint32).itemsize
+
+    # allocate 3 uint32 values each for grid, cluster, and block dimensions
+    grid_buffer = pinned_mr.allocate(3 * element_size)
+    cluster_buffer = pinned_mr.allocate(3 * element_size)
+    block_buffer = pinned_mr.allocate(3 * element_size)
+
+    # create NumPy arrays from the pinned memory
+    grid_dims = np.from_dlpack(grid_buffer).view(dtype=np.uint32)
+    cluster_dims = np.from_dlpack(cluster_buffer).view(dtype=np.uint32)
+    block_dims = np.from_dlpack(block_buffer).view(dtype=np.uint32)
+
+    # initialize arrays to zero
+    grid_dims[:] = 0
+    cluster_dims[:] = 0
+    block_dims[:] = 0
+
+    # launch kernel on the default stream
+    launch(dev.default_stream, config, ker, grid_buffer, cluster_buffer, block_buffer)
+    dev.sync()
+
+    # verify results
+    print("\nResults stored in pinned memory:")
+    print(f"Grid dimensions (blocks): {tuple(grid_dims)}")
+    print(f"Cluster dimensions: {tuple(cluster_dims)}")
+    print(f"Block dimensions (threads): {tuple(block_dims)}")
+
+    # verify that grid conversion worked correctly:
+    # LaunchConfig(grid=4, cluster=2) should result in 8 total blocks (4 clusters * 2 blocks/cluster)
+    expected_grid_blocks = grid * cluster  # 4 * 2 = 8
+    actual_grid_blocks = grid_dims[0]
+
+    print("\nVerification:")
+    print(f"LaunchConfig specified: grid={grid} clusters, cluster={cluster} blocks/cluster")
+    print(f"Expected total blocks: {expected_grid_blocks}")
+    print(f"Actual total blocks: {actual_grid_blocks}")
+
+    if actual_grid_blocks == expected_grid_blocks:
+        print("✓ Grid conversion is correct!")
+    else:
+        print("✗ Grid conversion failed!")
+        sys.exit(1)
+
+    print("done!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py
index d31ab77208..3db719f0a1 100644
--- a/cuda_core/examples/vector_add.py
+++ b/cuda_core/examples/vector_add.py
@@ -12,53 +12,55 @@
 import cupy as cp
 from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch
 
-# compute c = a + b
-code = """
-template<typename T>
-__global__ void vector_add(const T* A,
-                           const T* B,
-                           T* C,
-                           size_t N) {
-    const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
-    for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
-        C[i] = A[i] + B[i];
+
+def main():
+    # compute c = a + b
+    code = """
+    template<typename T>
+    __global__ void vector_add(const T* A, const T* B, T* C, size_t N) {
+        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
+        for (size_t i = tid; i < N; i += gridDim.x * blockDim.x) {
+            C[i] = A[i] + B[i];
+        }
     }
-}
-"""
+    """
+
+    dev = Device()
+    dev.set_current()
+    s = dev.create_stream()
 
+    # prepare program
+    program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
+    prog = Program(code, code_type="c++", options=program_options)
+    mod = prog.compile("cubin", name_expressions=("vector_add<float>",))
 
-dev = Device()
-dev.set_current()
-s = dev.create_stream()
+    # run in single precision
+    ker = mod.get_kernel("vector_add<float>")
+    dtype = cp.float32
 
-# prepare program
-program_options = ProgramOptions(std="c++17", arch=f"sm_{dev.arch}")
-prog = Program(code, code_type="c++", options=program_options)
-mod = prog.compile("cubin", name_expressions=("vector_add<float>",))
+    # prepare input/output
+    size = 50000
+    rng = cp.random.default_rng()
+    a = rng.random(size, dtype=dtype)
+    b = rng.random(size, dtype=dtype)
+    c = cp.empty_like(a)
 
-# run in single precision
-ker = mod.get_kernel("vector_add<float>")
-dtype = cp.float32
+    # cupy runs on a different stream from s, so sync before accessing
+    dev.sync()
 
-# prepare input/output
-size = 50000
-rng = cp.random.default_rng()
-a = rng.random(size, dtype=dtype)
-b = rng.random(size, dtype=dtype)
-c = cp.empty_like(a)
+    # prepare launch
+    block = 256
+    grid = (size + block - 1) // block
+    config = LaunchConfig(grid=grid, block=block)
 
-# cupy runs on a different stream from s, so sync before accessing
-dev.sync()
+    # launch kernel on stream s
+    launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
+    s.sync()
 
-# prepare launch
-block = 256
-grid = (size + block - 1) // block
-config = LaunchConfig(grid=grid, block=block)
+    # check result
+    assert cp.allclose(c, a + b)
+    print("done!")
 
-# launch kernel on stream s
-launch(s, config, ker, a.data.ptr, b.data.ptr, c.data.ptr, cp.uint64(size))
-s.sync()
 
-# check result
-assert cp.allclose(c, a + b)
-print("done!")
+if __name__ == "__main__":
+    main()