From f8a35cc62efa27029c83a65231f6609bf72dc00e Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: Mon, 29 Jun 2026 13:46:35 +0000 Subject: [PATCH] Fix CUDA bitpacked sliced output allocation Decode sliced bit-packed arrays in padded coordinates by sizing and launching for offset + len. This keeps the returned offset..offset+len device slice in bounds and ensures the final touched 1024-value chunk is decoded. Signed-off-by: "Alexander Droste" --- vortex-cuda/src/kernel/encodings/bitpacked.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs index 6820050b72f..0f00156058b 100644 --- a/vortex-cuda/src/kernel/encodings/bitpacked.rs +++ b/vortex-cuda/src/kernel/encodings/bitpacked.rs @@ -177,14 +177,16 @@ where // Get CUDA view of input let input_view = device_input.cuda_view::()?; + let output_len = offset + len; + // Allocate output buffer - let output_slice = ctx.device_alloc::(len.next_multiple_of(1024))?; + let output_slice = ctx.device_alloc::(output_len.next_multiple_of(1024))?; let output_buf = CudaDeviceBuffer::new(output_slice); let output_view = output_buf.as_view::(); let output_width = size_of::() * 8; let cuda_function = bitpacked_cuda_kernel(bit_width, output_width, ctx)?; - let config = bitpacked_cuda_launch_config(output_width, len)?; + let config = bitpacked_cuda_launch_config(output_width, output_len)?; // We hold this here to keep the device buffers alive. let device_patches = if let Some(patches) = patches { @@ -199,7 +201,7 @@ where let patches_arg = build_gpu_patches(device_patches.as_ref())?; - ctx.launch_kernel_config(&cuda_function, config, len, |args| { + ctx.launch_kernel_config(&cuda_function, config, output_len, |args| { args.arg(&input_view) .arg(&output_view) .arg(&reference) @@ -552,8 +554,14 @@ mod tests { Ok(()) } + #[rstest] + #[case(67, 3969)] + #[case(1, 1025)] #[crate::test] - fn test_cuda_bitunpack_sliced() -> VortexResult<()> { + fn test_cuda_bitunpack_sliced( + #[case] slice_start: usize, + #[case] slice_end: usize, + ) -> VortexResult<()> { let mut ctx = vortex_array::array_session().create_execution_ctx(); let bit_width = 32; let mut cuda_ctx = CudaSession::create_execution_ctx(&crate::cuda_session()) @@ -570,7 +578,7 @@ mod tests { let bitpacked_array = BitPacked::encode(&primitive_array.into_array(), bit_width, &mut ctx) .vortex_expect("operation should succeed in test"); - let sliced_array = bitpacked_array.into_array().slice(67..3969)?; + let sliced_array = bitpacked_array.into_array().slice(slice_start..slice_end)?; let gpu_result = block_on(async { BitPackedExecutor .execute(sliced_array.clone(), &mut cuda_ctx)