From f8a35cc62efa27029c83a65231f6609bf72dc00e Mon Sep 17 00:00:00 2001
From: Alexander Droste <alexander.droste@protonmail.com>
Date: Mon, 29 Jun 2026 13:46:35 +0000
Subject: [PATCH] Fix CUDA bitpacked sliced output allocation

Decode sliced bit-packed arrays in padded coordinates: size the output buffer and launch the kernel for `offset + len` values instead of `len`. This keeps the returned `offset..offset + len` device slice in bounds and ensures that the last 1024-value chunk touched by the slice is decoded.

Signed-off-by: "Alexander Droste" <alexander.droste@protonmail.com>
---
 vortex-cuda/src/kernel/encodings/bitpacked.rs | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs
index 6820050b72f..0f00156058b 100644
--- a/vortex-cuda/src/kernel/encodings/bitpacked.rs
+++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs
@@ -177,14 +177,16 @@ where
     // Get CUDA view of input
     let input_view = device_input.cuda_view::<A::Physical>()?;
 
+    let output_len = offset + len;
+
     // Allocate output buffer
-    let output_slice = ctx.device_alloc::<A>(len.next_multiple_of(1024))?;
+    let output_slice = ctx.device_alloc::<A>(output_len.next_multiple_of(1024))?;
     let output_buf = CudaDeviceBuffer::new(output_slice);
     let output_view = output_buf.as_view::<A>();
 
     let output_width = size_of::<A>() * 8;
     let cuda_function = bitpacked_cuda_kernel(bit_width, output_width, ctx)?;
-    let config = bitpacked_cuda_launch_config(output_width, len)?;
+    let config = bitpacked_cuda_launch_config(output_width, output_len)?;
 
     // We hold this here to keep the device buffers alive.
     let device_patches = if let Some(patches) = patches {
@@ -199,7 +201,7 @@ where
 
     let patches_arg = build_gpu_patches(device_patches.as_ref())?;
 
-    ctx.launch_kernel_config(&cuda_function, config, len, |args| {
+    ctx.launch_kernel_config(&cuda_function, config, output_len, |args| {
         args.arg(&input_view)
             .arg(&output_view)
             .arg(&reference)
@@ -552,8 +554,14 @@ mod tests {
         Ok(())
     }
 
+    #[rstest]
+    #[case(67, 3969)]
+    #[case(1, 1025)]
     #[crate::test]
-    fn test_cuda_bitunpack_sliced() -> VortexResult<()> {
+    fn test_cuda_bitunpack_sliced(
+        #[case] slice_start: usize,
+        #[case] slice_end: usize,
+    ) -> VortexResult<()> {
         let mut ctx = vortex_array::array_session().create_execution_ctx();
         let bit_width = 32;
         let mut cuda_ctx = CudaSession::create_execution_ctx(&crate::cuda_session())
@@ -570,7 +578,7 @@ mod tests {
 
         let bitpacked_array = BitPacked::encode(&primitive_array.into_array(), bit_width, &mut ctx)
             .vortex_expect("operation should succeed in test");
-        let sliced_array = bitpacked_array.into_array().slice(67..3969)?;
+        let sliced_array = bitpacked_array.into_array().slice(slice_start..slice_end)?;
         let gpu_result = block_on(async {
             BitPackedExecutor
                 .execute(sliced_array.clone(), &mut cuda_ctx)