DiamondLightSource · mfep · Feb 12, 2026 · Feb 12, 2026 · Feb 12, 2026 · Feb 17, 2026
diff --git a/httomolibgpu/cuda_kernels/calc_metrics.cu b/httomolibgpu/cuda_kernels/calc_metrics.cu
@@ -321,7 +321,7 @@ __device__ void _calc_metrics_overlap(const float *mat1, int mat1_nx,
  * the compiler (rather than at runtime), which reduces the register count.
  */
 template <bool norm, bool use_overlap>
-__global__ void calc_metrics_kernel(const float *mat1, int mat1_nx,
+__device__ __forceinline__ void calc_metrics_kernel_impl(const float *mat1, int mat1_nx,
                                     const float *mat2, int mat2_nx,
                                     int win_width, int rows, int side,
                                     float *list_metric)
@@ -331,4 +331,28 @@ __global__ void calc_metrics_kernel(const float *mat1, int mat1_nx,
     } else {
         _calc_metrics_no_overlap<norm>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric);
     }
-}
+}
+
+extern "C" __global__ void calc_metrics_kernel(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx,
+                                               int win_width, int rows, int side, float *list_metric)
+{
+    calc_metrics_kernel_impl<false, false>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric);
+}
+
+extern "C" __global__ void calc_metrics_kernel_norm(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx,
+                                                    int win_width, int rows, int side, float *list_metric)
+{
+    calc_metrics_kernel_impl<true, false>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric);
+}
+
+extern "C" __global__ void calc_metrics_kernel_use_overlap(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx,
+                                                           int win_width, int rows, int side, float *list_metric)
+{
+    calc_metrics_kernel_impl<false, true>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric);
+}
+
+extern "C" __global__ void calc_metrics_kernel_norm_use_overlap(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx,
+                                                                int win_width, int rows, int side, float *list_metric)
+{
+    calc_metrics_kernel_impl<true, true>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric);
+}
diff --git a/httomolibgpu/cuda_kernels/median_kernel.cu b/httomolibgpu/cuda_kernels/median_kernel.cu
@@ -1,6 +1,6 @@
 template <typename Type, int diameter>
-__global__ void median_general_kernel3d(const Type *in, Type *out, float dif,
-                                      int Z, int M, int N) {
+__device__ __forceinline__ void median_general_kernel3d_impl(const Type *in, Type *out, float dif,
+                                                             int Z, int M, int N) {
   constexpr int radius = diameter / 2;
   constexpr int d3 = diameter * diameter * diameter;
   constexpr int midpoint = d3 / 2;
@@ -52,3 +52,63 @@ __global__ void median_general_kernel3d(const Type *in, Type *out, float dif,
   }
   else out[index] = ValVec[midpoint]; /* median filtering */
 }
+
+extern "C" __global__ void median_general_kernel3d_float_3(const float *in, float *out, float dif, int Z, int M, int N)
+{
+  median_general_kernel3d_impl<float, 3>(in, out, dif, Z, M, N);
+}
+
+extern "C" __global__ void median_general_kernel3d_float_5(const float *in, float *out, float dif, int Z, int M, int N)
+{
+  median_general_kernel3d_impl<float, 5>(in, out, dif, Z, M, N);
+}
+
+extern "C" __global__ void median_general_kernel3d_float_7(const float *in, float *out, float dif, int Z, int M, int N)
+{
+  median_general_kernel3d_impl<float, 7>(in, out, dif, Z, M, N);
+}
+
+extern "C" __global__ void median_general_kernel3d_float_9(const float *in, float *out, float dif, int Z, int M, int N)
+{
+  median_general_kernel3d_impl<float, 9>(in, out, dif, Z, M, N);
+}
+
+extern "C" __global__ void median_general_kernel3d_float_11(const float *in, float *out, float dif, int Z, int M, int N)
+{
+  median_general_kernel3d_impl<float, 11>(in, out, dif, Z, M, N);
+}
+
+extern "C" __global__ void median_general_kernel3d_float_13(const float *in, float *out, float dif, int Z, int M, int N)
+{
+  median_general_kernel3d_impl<float, 13>(in, out, dif, Z, M, N);
+}
+
+extern "C" __global__ void median_general_kernel3d_unsigned_short_3(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N)
+{
+  median_general_kernel3d_impl<unsigned short, 3>(in, out, dif, Z, M, N);
+}
+
+extern "C" __global__ void median_general_kernel3d_unsigned_short_5(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N)
+{
+  median_general_kernel3d_impl<unsigned short, 5>(in, out, dif, Z, M, N);
+}
+
+extern "C" __global__ void median_general_kernel3d_unsigned_short_7(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N)
+{
+  median_general_kernel3d_impl<unsigned short, 7>(in, out, dif, Z, M, N);
+}
+
+extern "C" __global__ void median_general_kernel3d_unsigned_short_9(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N)
+{
+  median_general_kernel3d_impl<unsigned short, 9>(in, out, dif, Z, M, N);
+}
+
+extern "C" __global__ void median_general_kernel3d_unsigned_short_11(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N)
+{
+  median_general_kernel3d_impl<unsigned short, 11>(in, out, dif, Z, M, N);
+}
+
+extern "C" __global__ void median_general_kernel3d_unsigned_short_13(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N)
+{
+  median_general_kernel3d_impl<unsigned short, 13>(in, out, dif, Z, M, N);
+}
diff --git a/httomolibgpu/cuda_kernels/raven_filter.cu b/httomolibgpu/cuda_kernels/raven_filter.cu
@@ -1,8 +1,8 @@
 #include <cupy/complex.cuh>
 
 template <typename Type>
-__global__ void 
-raven_filter(
+__device__ __forceinline__ void 
+raven_filter_impl(
   complex<Type> *input,
   complex<Type> *output,
   int width, int images, int height, 
@@ -47,3 +47,19 @@ raven_filter(
 
   output[outIndex] = value;
 }
+
+extern "C" __global__ void raven_filter_float(complex<float> *input,
+                                              complex<float> *output,
+                                              int width, int images, int height, 
+                                              int u0, int n, int v0)
+{
+  raven_filter_impl<float>(input, output, width, images, height, u0, n, v0);
+}
+
+extern "C" __global__ void raven_filter_double(complex<double> *input,
+                                               complex<double> *output,
+                                               int width, int images, int height, 
+                                               int u0, int n, int v0)
+{
+  raven_filter_impl<double>(input, output, width, images, height, u0, n, v0);
+}
diff --git a/httomolibgpu/cuda_kernels/remove_nan_inf.cu b/httomolibgpu/cuda_kernels/remove_nan_inf.cu
@@ -1,5 +1,5 @@
 template <typename Type>
-__global__ void remove_nan_inf(Type *data, int Z, int M, int N, int *result) {
+__device__ __forceinline__ void remove_nan_inf_impl(Type *data, int Z, int M, int N, int *result) {
   const long i = blockDim.x * blockIdx.x + threadIdx.x;
   const long j = blockDim.y * blockIdx.y + threadIdx.y;
   const long k = blockDim.z * blockIdx.z + threadIdx.z;
@@ -16,4 +16,14 @@ __global__ void remove_nan_inf(Type *data, int Z, int M, int N, int *result) {
     data[index] = zero;
   }
 
-}
+}
+
+extern "C" __global__ void remove_nan_inf_float(float *data, int Z, int M, int N, int *result)
+{
+  remove_nan_inf_impl<float>(data, Z, M, N, result);
+}
+
+extern "C" __global__ void remove_nan_inf_unsigned_short(unsigned short *data, int Z, int M, int N, int *result)
+{
+  remove_nan_inf_impl<unsigned short>(data, Z, M, N, result);
+}
diff --git a/httomolibgpu/cuda_kernels/remove_stripe_fw.cu b/httomolibgpu/cuda_kernels/remove_stripe_fw.cu
@@ -1,5 +1,4 @@
-template<int WSize>
-__global__ void grouped_convolution_x(
+extern "C" __global__ void grouped_convolution_x(
     int dim_x,
     int dim_y,
     int dim_z,
@@ -10,7 +9,8 @@ __global__ void grouped_convolution_x(
     float* out,
     int out_stride_z,
     int out_stride_group,
-    const float* w
+    const float* w,
+    int wk
 )
 {
     const int g_thd_x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -25,9 +25,9 @@ __global__ void grouped_convolution_x(
     for (int i = 0; i < out_groups; ++i)
     {
         float acc = 0.F;
-        for (int j = 0; j < WSize; ++j)
+        for (int j = 0; j < wk; ++j)
         {
-            const int w_idx = i * WSize + j;
+            const int w_idx = i * wk + j;
             const int in_idx = (g_thd_x * in_stride_x + j) + g_thd_y * in_stride_y + g_thd_z * in_stride_z;
             acc += w[w_idx] * in[in_idx];
         }
@@ -36,8 +36,7 @@ __global__ void grouped_convolution_x(
     }
 }
 
-template<int WSize>
-__global__ void grouped_convolution_y(
+extern "C" __global__ void grouped_convolution_y(
     int dim_x,
     int dim_y,
     int dim_z,
@@ -49,7 +48,8 @@ __global__ void grouped_convolution_y(
     float* out,
     int out_stride_z,
     int out_stride_group,
-    const float* w
+    const float* w,
+    int hk
 )
 {
     const int g_thd_x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -68,9 +68,9 @@ __global__ void grouped_convolution_y(
         for (int i = 0; i < out_groups; ++i)
         {
             float acc = 0.F;
-            for (int j = 0; j < WSize; ++j)
+            for (int j = 0; j < hk; ++j)
             {
-                const int w_idx = (out_groups * group + i) * WSize + j;
+                const int w_idx = (out_groups * group + i) * hk + j;
                 const int in_idx = g_thd_x * in_stride_x + (item_stride_y * g_thd_y + j) * in_stride_y + group * in_stride_group + g_thd_z * in_stride_z;
                 acc += w[w_idx] * in[in_idx];
             }
@@ -80,8 +80,7 @@ __global__ void grouped_convolution_y(
     }
 }
 
-template<int WSize>
-__global__ void transposed_convolution_x(
+extern "C" __global__ void transposed_convolution_x(
     int dim_x,
     int dim_y,
     int dim_z,
@@ -90,6 +89,7 @@ __global__ void transposed_convolution_x(
     int in_stride_y,
     int in_stride_z,
     const float* w,
+    int wk,
     float* out
 )
 {
@@ -103,7 +103,7 @@ __global__ void transposed_convolution_x(
 
     constexpr int item_out_stride = 2;
     float acc = 0.F;
-    for (int i = 0; i < WSize; ++i)
+    for (int i = 0; i < wk; ++i)
     {
         const int in_x = (g_thd_x - i) / item_out_stride;
         const int in_x_mod = (g_thd_x - i) % item_out_stride;
@@ -117,8 +117,7 @@ __global__ void transposed_convolution_x(
     out[out_idx] = acc;
 }
 
-template<int WSize>
-__global__ void transposed_convolution_y(
+extern "C" __global__ void transposed_convolution_y(
     int dim_x,
     int dim_y,
     int dim_z,
@@ -127,6 +126,7 @@ __global__ void transposed_convolution_y(
     int in_stride_y,
     int in_stride_z,
     const float* w,
+    int hk,
     float* out
 )
 {
@@ -140,7 +140,7 @@ __global__ void transposed_convolution_y(
 
     constexpr int item_out_stride = 2;
     float acc = 0.F;
-    for (int i = 0; i < WSize; ++i)
+    for (int i = 0; i < hk; ++i)
     {
         const int in_y = (g_thd_y - i) / item_out_stride;
         const int in_y_mod = (g_thd_y - i) % item_out_stride;

diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py
@@ -87,8 +87,8 @@ def median_filter(
     output = cp.copy(data, order="C")
 
     # 3d median or dezinger
-    kernel_args = "median_general_kernel3d<{0}, {1}>".format(
-        "float" if input_type == "float32" else "unsigned short", kernel_size
+    kernel_name = "median_general_kernel3d_{0}_{1}".format(
+        "float" if input_type == "float32" else "unsigned_short", kernel_size
     )
     block_x = 128
     # setting grid/block parameters
@@ -99,8 +99,8 @@ def median_filter(
     grid_dims = (grid_x, grid_y, grid_z)
     params = (data, output, cp.float32(dif), dz, dy, dx)
 
-    median_module = load_cuda_module("median_kernel", name_expressions=[kernel_args])
-    median_filt = median_module.get_function(kernel_args)
+    median_module = load_cuda_module("median_kernel")
+    median_filt = median_module.get_function(kernel_name)
 
     median_filt(grid_dims, block_dims, params)
 

diff --git a/httomolibgpu/misc/utils.py b/httomolibgpu/misc/utils.py
@@ -125,12 +125,12 @@ def __naninfs_check(
     grid_dims = (grid_x, grid_y, grid_z)
     params = (data, dz, dy, dx, present_nans_infs)
 
-    kernel_args = "remove_nan_inf<{0}>".format(
-        "float" if input_type == "float32" else "unsigned short"
+    kernel_name = "remove_nan_inf_{0}".format(
+        "float" if input_type == "float32" else "unsigned_short"
     )
 
-    module = load_cuda_module("remove_nan_inf", name_expressions=[kernel_args])
-    remove_nan_inf_kernel = module.get_function(kernel_args)
+    module = load_cuda_module("remove_nan_inf")
+    remove_nan_inf_kernel = module.get_function(kernel_name)
     remove_nan_inf_kernel(grid_dims, block_dims, params)
 
     if present_nans_infs[0].get() == 1: