diff --git a/httomolibgpu/cuda_kernels/calc_metrics.cu b/httomolibgpu/cuda_kernels/calc_metrics.cu index 5027d1df..fd0c1365 100644 --- a/httomolibgpu/cuda_kernels/calc_metrics.cu +++ b/httomolibgpu/cuda_kernels/calc_metrics.cu @@ -321,7 +321,7 @@ __device__ void _calc_metrics_overlap(const float *mat1, int mat1_nx, * the compiler (rather than at runtime), which reduces the register count. */ template <bool norm, bool use_overlap> -__global__ void calc_metrics_kernel(const float *mat1, int mat1_nx, +__device__ __forceinline__ void calc_metrics_kernel_impl(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx, int win_width, int rows, int side, float *list_metric) @@ -331,4 +331,28 @@ __global__ void calc_metrics_kernel(const float *mat1, int mat1_nx, } else { _calc_metrics_no_overlap(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric); } -} \ No newline at end of file +} + +extern "C" __global__ void calc_metrics_kernel(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx, + int win_width, int rows, int side, float *list_metric) +{ + calc_metrics_kernel_impl<false, false>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric); +} + +extern "C" __global__ void calc_metrics_kernel_norm(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx, + int win_width, int rows, int side, float *list_metric) +{ + calc_metrics_kernel_impl<true, false>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric); +} + +extern "C" __global__ void calc_metrics_kernel_use_overlap(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx, + int win_width, int rows, int side, float *list_metric) +{ + calc_metrics_kernel_impl<false, true>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric); +} + +extern "C" __global__ void calc_metrics_kernel_norm_use_overlap(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx, + int win_width, int rows, int side, float *list_metric) +{ + calc_metrics_kernel_impl<true, true>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric); +} diff 
--git a/httomolibgpu/cuda_kernels/median_kernel.cu b/httomolibgpu/cuda_kernels/median_kernel.cu index 8cdbbb64..b3bae1ca 100644 --- a/httomolibgpu/cuda_kernels/median_kernel.cu +++ b/httomolibgpu/cuda_kernels/median_kernel.cu @@ -1,6 +1,6 @@ template <typename Type, int diameter> -__global__ void median_general_kernel3d(const Type *in, Type *out, float dif, - int Z, int M, int N) { +__device__ __forceinline__ void median_general_kernel3d_impl(const Type *in, Type *out, float dif, + int Z, int M, int N) { constexpr int radius = diameter / 2; constexpr int d3 = diameter * diameter * diameter; constexpr int midpoint = d3 / 2; @@ -52,3 +52,63 @@ __global__ void median_general_kernel3d(const Type *in, Type *out, float dif, } else out[index] = ValVec[midpoint]; /* median filtering */ } + +extern "C" __global__ void median_general_kernel3d_float_3(const float *in, float *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<float, 3>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_float_5(const float *in, float *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<float, 5>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_float_7(const float *in, float *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<float, 7>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_float_9(const float *in, float *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<float, 9>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_float_11(const float *in, float *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<float, 11>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_float_13(const float *in, float *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<float, 13>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_unsigned_short_3(const unsigned short *in, unsigned short *out, float dif, int 
Z, int M, int N) +{ + median_general_kernel3d_impl<unsigned short, 3>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_unsigned_short_5(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<unsigned short, 5>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_unsigned_short_7(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<unsigned short, 7>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_unsigned_short_9(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<unsigned short, 9>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_unsigned_short_11(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<unsigned short, 11>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_unsigned_short_13(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<unsigned short, 13>(in, out, dif, Z, M, N); +} diff --git a/httomolibgpu/cuda_kernels/raven_filter.cu b/httomolibgpu/cuda_kernels/raven_filter.cu index 5892c6d8..e2e0fed0 100644 --- a/httomolibgpu/cuda_kernels/raven_filter.cu +++ b/httomolibgpu/cuda_kernels/raven_filter.cu @@ -1,8 +1,8 @@ #include <cupy/complex.cuh> template <typename real> -__global__ void -raven_filter( +__device__ __forceinline__ void +raven_filter_impl( complex<real> *input, complex<real> *output, int width, int images, int height, @@ -47,3 +47,19 @@ raven_filter( output[outIndex] = value; } + +extern "C" __global__ void raven_filter_float(complex<float> *input, + complex<float> *output, + int width, int images, int height, + int u0, int n, int v0) +{ + raven_filter_impl(input, output, width, images, height, u0, n, v0); +} + +extern "C" __global__ void raven_filter_double(complex<double> *input, + complex<double> *output, + int width, int images, int height, + int u0, int n, int v0) +{ + 
raven_filter_impl(input, output, width, images, height, u0, n, v0); +} diff --git a/httomolibgpu/cuda_kernels/remove_nan_inf.cu b/httomolibgpu/cuda_kernels/remove_nan_inf.cu index 14e4584d..f3d89ad3 100644 --- a/httomolibgpu/cuda_kernels/remove_nan_inf.cu +++ b/httomolibgpu/cuda_kernels/remove_nan_inf.cu @@ -1,5 +1,5 @@ template <typename Type> -__global__ void remove_nan_inf(Type *data, int Z, int M, int N, int *result) { +__device__ __forceinline__ void remove_nan_inf_impl(Type *data, int Z, int M, int N, int *result) { const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; const long k = blockDim.z * blockIdx.z + threadIdx.z; @@ -16,4 +16,14 @@ __global__ void remove_nan_inf(Type *data, int Z, int M, int N, int *result) { data[index] = zero; } -} \ No newline at end of file +} + +extern "C" __global__ void remove_nan_inf_float(float *data, int Z, int M, int N, int *result) +{ + remove_nan_inf_impl(data, Z, M, N, result); +} + +extern "C" __global__ void remove_nan_inf_unsigned_short(unsigned short *data, int Z, int M, int N, int *result) +{ + remove_nan_inf_impl(data, Z, M, N, result); +} diff --git a/httomolibgpu/cuda_kernels/remove_stripe_fw.cu b/httomolibgpu/cuda_kernels/remove_stripe_fw.cu index 16597f4e..4a7bd13b 100644 --- a/httomolibgpu/cuda_kernels/remove_stripe_fw.cu +++ b/httomolibgpu/cuda_kernels/remove_stripe_fw.cu @@ -1,5 +1,4 @@ -template <int WSize> -__global__ void grouped_convolution_x( +extern "C" __global__ void grouped_convolution_x( int dim_x, int dim_y, int dim_z, @@ -10,7 +9,8 @@ __global__ void grouped_convolution_x( float* out, int out_stride_z, int out_stride_group, - const float* w + const float* w, + int wk ) { const int g_thd_x = blockDim.x * blockIdx.x + threadIdx.x; @@ -25,9 +25,9 @@ __global__ void grouped_convolution_x( for (int i = 0; i < out_groups; ++i) { float acc = 0.F; - for (int j = 0; j < WSize; ++j) + for (int j = 0; j < wk; ++j) { - const int w_idx = i * WSize + j; + const int w_idx = i * wk + 
j; const int in_idx = (g_thd_x * in_stride_x + j) + g_thd_y * in_stride_y + g_thd_z * in_stride_z; acc += w[w_idx] * in[in_idx]; } @@ -36,8 +36,7 @@ __global__ void grouped_convolution_x( } } -template <int WSize> -__global__ void grouped_convolution_y( +extern "C" __global__ void grouped_convolution_y( int dim_x, int dim_y, int dim_z, @@ -49,7 +48,8 @@ __global__ void grouped_convolution_y( float* out, int out_stride_z, int out_stride_group, - const float* w + const float* w, + int hk ) { const int g_thd_x = blockDim.x * blockIdx.x + threadIdx.x; @@ -68,9 +68,9 @@ __global__ void grouped_convolution_y( for (int i = 0; i < out_groups; ++i) { float acc = 0.F; - for (int j = 0; j < WSize; ++j) + for (int j = 0; j < hk; ++j) { - const int w_idx = (out_groups * group + i) * WSize + j; + const int w_idx = (out_groups * group + i) * hk + j; const int in_idx = g_thd_x * in_stride_x + (item_stride_y * g_thd_y + j) * in_stride_y + group * in_stride_group + g_thd_z * in_stride_z; acc += w[w_idx] * in[in_idx]; } @@ -80,8 +80,7 @@ __global__ void grouped_convolution_y( } } -template <int WSize> -__global__ void transposed_convolution_x( +extern "C" __global__ void transposed_convolution_x( int dim_x, int dim_y, int dim_z, @@ -90,6 +89,7 @@ __global__ void transposed_convolution_x( int in_stride_y, int in_stride_z, const float* w, + int wk, float* out ) { @@ -103,7 +103,7 @@ __global__ void transposed_convolution_x( constexpr int item_out_stride = 2; float acc = 0.F; - for (int i = 0; i < WSize; ++i) + for (int i = 0; i < wk; ++i) { const int in_x = (g_thd_x - i) / item_out_stride; const int in_x_mod = (g_thd_x - i) % item_out_stride; @@ -117,8 +117,7 @@ __global__ void transposed_convolution_x( out[out_idx] = acc; } -template <int WSize> -__global__ void transposed_convolution_y( +extern "C" __global__ void transposed_convolution_y( int dim_x, int dim_y, int dim_z, @@ -127,6 +126,7 @@ __global__ void transposed_convolution_y( int in_stride_y, int in_stride_z, const float* w, + int hk, float* out ) { @@ -140,7 
+140,7 @@ __global__ void transposed_convolution_y( constexpr int item_out_stride = 2; float acc = 0.F; - for (int i = 0; i < WSize; ++i) + for (int i = 0; i < hk; ++i) { const int in_y = (g_thd_y - i) / item_out_stride; const int in_y_mod = (g_thd_y - i) % item_out_stride; diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py index 95d1b71a..8a13fc70 100644 --- a/httomolibgpu/misc/corr.py +++ b/httomolibgpu/misc/corr.py @@ -87,8 +87,8 @@ def median_filter( output = cp.copy(data, order="C") # 3d median or dezinger - kernel_args = "median_general_kernel3d<{0}, {1}>".format( - "float" if input_type == "float32" else "unsigned short", kernel_size + kernel_name = "median_general_kernel3d_{0}_{1}".format( + "float" if input_type == "float32" else "unsigned_short", kernel_size ) block_x = 128 # setting grid/block parameters @@ -99,8 +99,8 @@ def median_filter( grid_dims = (grid_x, grid_y, grid_z) params = (data, output, cp.float32(dif), dz, dy, dx) - median_module = load_cuda_module("median_kernel", name_expressions=[kernel_args]) - median_filt = median_module.get_function(kernel_args) + median_module = load_cuda_module("median_kernel") + median_filt = median_module.get_function(kernel_name) median_filt(grid_dims, block_dims, params) diff --git a/httomolibgpu/misc/utils.py b/httomolibgpu/misc/utils.py index 6a6b85bf..cf5f6f2c 100644 --- a/httomolibgpu/misc/utils.py +++ b/httomolibgpu/misc/utils.py @@ -125,12 +125,12 @@ def __naninfs_check( grid_dims = (grid_x, grid_y, grid_z) params = (data, dz, dy, dx, present_nans_infs) - kernel_args = "remove_nan_inf<{0}>".format( - "float" if input_type == "float32" else "unsigned short" + kernel_name = "remove_nan_inf_{0}".format( + "float" if input_type == "float32" else "unsigned_short" ) - module = load_cuda_module("remove_nan_inf", name_expressions=[kernel_args]) - remove_nan_inf_kernel = module.get_function(kernel_args) + module = load_cuda_module("remove_nan_inf") + remove_nan_inf_kernel = 
module.get_function(kernel_name) remove_nan_inf_kernel(grid_dims, block_dims, params) if present_nans_infs[0].get() == 1: diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py index 3418b89a..c4611781 100644 --- a/httomolibgpu/prep/stripe.py +++ b/httomolibgpu/prep/stripe.py @@ -288,8 +288,7 @@ def _conv2d( w = cp.asarray(w) x = cp.expand_dims(x, axis=1) w = np.expand_dims(w, axis=0) - symbol_names = [f"grouped_convolution_x<{wk}>", f"grouped_convolution_y<{hk}>"] - module = load_cuda_module("remove_stripe_fw", name_expressions=symbol_names) + module = load_cuda_module("remove_stripe_fw") dim_x = out.shape[-1] dim_y = out.shape[-2] dim_z = out.shape[0] @@ -305,7 +304,7 @@ def _conv2d( grid_dim = (grid_x, dim_y, dim_z) if groups == 1: - grouped_convolution_kernel_x = module.get_function(symbol_names[0]) + grouped_convolution_kernel_x = module.get_function("grouped_convolution_x") grouped_convolution_kernel_x( grid_dim, block_dim, @@ -321,11 +320,12 @@ def _conv2d( out_stride_z, out_stride_group, w, + wk, ), ) return out - grouped_convolution_kernel_y = module.get_function(symbol_names[1]) + grouped_convolution_kernel_y = module.get_function("grouped_convolution_y") in_stride_group = x.strides[2] // x.dtype.itemsize grouped_convolution_kernel_y( grid_dim, @@ -343,6 +343,7 @@ def _conv2d( out_stride_z, out_stride_group, w, + hk, ), ) del w @@ -383,11 +384,7 @@ def _conv_transpose2d( out = cp.zeros(out_shape, dtype="float32") w = cp.asarray(w) - symbol_names = [ - f"transposed_convolution_x<{wk}>", - f"transposed_convolution_y<{hk}>", - ] - module = load_cuda_module("remove_stripe_fw", name_expressions=symbol_names) + module = load_cuda_module("remove_stripe_fw") dim_x = out.shape[-1] dim_y = out.shape[-2] dim_z = out.shape[0] @@ -402,18 +399,44 @@ def _conv_transpose2d( grid_dim = (grid_x, dim_y, dim_z) if wk > 1: - transposed_convolution_kernel_x = module.get_function(symbol_names[0]) + transposed_convolution_kernel_x = module.get_function( + 
"transposed_convolution_x" + ) transposed_convolution_kernel_x( grid_dim, block_dim, - (dim_x, dim_y, dim_z, x, in_dim_x, in_stride_y, in_stride_z, w, out), + ( + dim_x, + dim_y, + dim_z, + x, + in_dim_x, + in_stride_y, + in_stride_z, + w, + wk, + out, + ), ) elif hk > 1: - transposed_convolution_kernel_y = module.get_function(symbol_names[1]) + transposed_convolution_kernel_y = module.get_function( + "transposed_convolution_y" + ) transposed_convolution_kernel_y( grid_dim, block_dim, - (dim_x, dim_y, dim_z, x, in_dim_y, in_stride_y, in_stride_z, w, out), + ( + dim_x, + dim_y, + dim_z, + x, + in_dim_y, + in_stride_y, + in_stride_z, + w, + hk, + out, + ), ) else: assert False @@ -1001,7 +1024,7 @@ def raven_filter( height, images, width = data.shape # Set the input type of the kernel - kernel_args = "raven_filter<{0}>".format( + kernel_name = "raven_filter_{0}".format( "float" if calc_type == "complex64" else "double" ) @@ -1014,8 +1037,8 @@ def raven_filter( grid_dims = (grid_x, grid_y, grid_z) params = (fft_data_shifted, fft_data, width, images, height, uvalue, nvalue, vvalue) - raven_module = load_cuda_module("raven_filter", name_expressions=[kernel_args]) - raven_filt = raven_module.get_function(kernel_args) + raven_module = load_cuda_module("raven_filter") + raven_filt = raven_module.get_function(kernel_name) raven_filt(grid_dims, block_dims, params) del fft_data_shifted diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py index c3aa47d2..41176b55 100644 --- a/httomolibgpu/recon/rotation.py +++ b/httomolibgpu/recon/rotation.py @@ -666,12 +666,6 @@ def _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm): _calc_metrics_module = load_cuda_module( "calc_metrics", - name_expressions=[ - "calc_metrics_kernel<false, false>", - "calc_metrics_kernel<false, true>", - "calc_metrics_kernel<true, false>", - "calc_metrics_kernel<true, true>", - ], options=("--maxrregcount=32",), ) @@ -691,10 +685,12 @@ def _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm): block = (128, 1, 1) grid 
= (1, np.int32(num_pos), 1) smem = block[0] * 4 * 6 if use_overlap else block[0] * 4 * 3 - bool2str = lambda x: "true" if x is True else "false" - calc_metrics = _calc_metrics_module.get_function( - f"calc_metrics_kernel<{bool2str(norm)}, {bool2str(use_overlap)}>" - ) + calc_metrics_kernel_name = "calc_metrics_kernel" + if norm: + calc_metrics_kernel_name += "_norm" + if use_overlap: + calc_metrics_kernel_name += "_use_overlap" + calc_metrics = _calc_metrics_module.get_function(calc_metrics_kernel_name) calc_metrics(grid=grid, block=block, args=args, shared_mem=smem) return list_metric