diff --git a/httomolibgpu/cuda_kernels/calc_metrics.cu b/httomolibgpu/cuda_kernels/calc_metrics.cu index 5027d1df..fd0c1365 100644 --- a/httomolibgpu/cuda_kernels/calc_metrics.cu +++ b/httomolibgpu/cuda_kernels/calc_metrics.cu @@ -321,7 +321,7 @@ __device__ void _calc_metrics_overlap(const float *mat1, int mat1_nx, * the compiler (rather than at runtime), which reduces the register count. */ template <bool norm, bool use_overlap> -__global__ void calc_metrics_kernel(const float *mat1, int mat1_nx, +__device__ __forceinline__ void calc_metrics_kernel_impl(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx, int win_width, int rows, int side, float *list_metric) @@ -331,4 +331,28 @@ __global__ void calc_metrics_kernel(const float *mat1, int mat1_nx, } else { _calc_metrics_no_overlap(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric); } -} \ No newline at end of file +} + +extern "C" __global__ void calc_metrics_kernel(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx, + int win_width, int rows, int side, float *list_metric) +{ + calc_metrics_kernel_impl<false, false>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric); +} + +extern "C" __global__ void calc_metrics_kernel_norm(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx, + int win_width, int rows, int side, float *list_metric) +{ + calc_metrics_kernel_impl<true, false>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric); +} + +extern "C" __global__ void calc_metrics_kernel_use_overlap(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx, + int win_width, int rows, int side, float *list_metric) +{ + calc_metrics_kernel_impl<false, true>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric); +} + +extern "C" __global__ void calc_metrics_kernel_norm_use_overlap(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx, + int win_width, int rows, int side, float *list_metric) +{ + calc_metrics_kernel_impl<true, true>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric); +} diff 
--git a/httomolibgpu/cuda_kernels/median_kernel.cu b/httomolibgpu/cuda_kernels/median_kernel.cu index 8cdbbb64..b3bae1ca 100644 --- a/httomolibgpu/cuda_kernels/median_kernel.cu +++ b/httomolibgpu/cuda_kernels/median_kernel.cu @@ -1,6 +1,6 @@ template <typename Type, int diameter> -__global__ void median_general_kernel3d(const Type *in, Type *out, float dif, - int Z, int M, int N) { +__device__ __forceinline__ void median_general_kernel3d_impl(const Type *in, Type *out, float dif, + int Z, int M, int N) { constexpr int radius = diameter / 2; constexpr int d3 = diameter * diameter * diameter; constexpr int midpoint = d3 / 2; @@ -52,3 +52,63 @@ __global__ void median_general_kernel3d(const Type *in, Type *out, float dif, } else out[index] = ValVec[midpoint]; /* median filtering */ } + +extern "C" __global__ void median_general_kernel3d_float_3(const float *in, float *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<float, 3>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_float_5(const float *in, float *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<float, 5>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_float_7(const float *in, float *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<float, 7>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_float_9(const float *in, float *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<float, 9>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_float_11(const float *in, float *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<float, 11>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_float_13(const float *in, float *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<float, 13>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_unsigned_short_3(const unsigned short *in, unsigned short *out, float dif, int 
Z, int M, int N) +{ + median_general_kernel3d_impl<unsigned short, 3>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_unsigned_short_5(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<unsigned short, 5>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_unsigned_short_7(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<unsigned short, 7>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_unsigned_short_9(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<unsigned short, 9>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_unsigned_short_11(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<unsigned short, 11>(in, out, dif, Z, M, N); +} + +extern "C" __global__ void median_general_kernel3d_unsigned_short_13(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N) +{ + median_general_kernel3d_impl<unsigned short, 13>(in, out, dif, Z, M, N); +} diff --git a/httomolibgpu/cuda_kernels/raven_filter.cu b/httomolibgpu/cuda_kernels/raven_filter.cu index 5892c6d8..e2e0fed0 100644 --- a/httomolibgpu/cuda_kernels/raven_filter.cu +++ b/httomolibgpu/cuda_kernels/raven_filter.cu @@ -1,8 +1,8 @@ #include <cupy/complex.cuh> template <typename real> -__global__ void -raven_filter( +__device__ __forceinline__ void +raven_filter_impl( complex<real> *input, complex<real> *output, int width, int images, int height, @@ -47,3 +47,19 @@ raven_filter( output[outIndex] = value; } + +extern "C" __global__ void raven_filter_float(complex<float> *input, + complex<float> *output, + int width, int images, int height, + int u0, int n, int v0) +{ + raven_filter_impl(input, output, width, images, height, u0, n, v0); +} + +extern "C" __global__ void raven_filter_double(complex<double> *input, + complex<double> *output, + int width, int images, int height, + int u0, int n, int v0) +{ + 
raven_filter_impl(input, output, width, images, height, u0, n, v0); +} diff --git a/httomolibgpu/cuda_kernels/remove_nan_inf.cu b/httomolibgpu/cuda_kernels/remove_nan_inf.cu index 14e4584d..f3d89ad3 100644 --- a/httomolibgpu/cuda_kernels/remove_nan_inf.cu +++ b/httomolibgpu/cuda_kernels/remove_nan_inf.cu @@ -1,5 +1,5 @@ template <typename Type> -__global__ void remove_nan_inf(Type *data, int Z, int M, int N, int *result) { +__device__ __forceinline__ void remove_nan_inf_impl(Type *data, int Z, int M, int N, int *result) { const long i = blockDim.x * blockIdx.x + threadIdx.x; const long j = blockDim.y * blockIdx.y + threadIdx.y; const long k = blockDim.z * blockIdx.z + threadIdx.z; @@ -16,4 +16,14 @@ __global__ void remove_nan_inf(Type *data, int Z, int M, int N, int *result) { data[index] = zero; } -} \ No newline at end of file +} + +extern "C" __global__ void remove_nan_inf_float(float *data, int Z, int M, int N, int *result) +{ + remove_nan_inf_impl(data, Z, M, N, result); +} + +extern "C" __global__ void remove_nan_inf_unsigned_short(unsigned short *data, int Z, int M, int N, int *result) +{ + remove_nan_inf_impl(data, Z, M, N, result); +} diff --git a/httomolibgpu/cuda_kernels/remove_stripe_fw.cu b/httomolibgpu/cuda_kernels/remove_stripe_fw.cu index 16597f4e..4a7bd13b 100644 --- a/httomolibgpu/cuda_kernels/remove_stripe_fw.cu +++ b/httomolibgpu/cuda_kernels/remove_stripe_fw.cu @@ -1,5 +1,4 @@ -template <int WSize> -__global__ void grouped_convolution_x( +extern "C" __global__ void grouped_convolution_x( int dim_x, int dim_y, int dim_z, @@ -10,7 +9,8 @@ __global__ void grouped_convolution_x( float* out, int out_stride_z, int out_stride_group, - const float* w + const float* w, + int wk ) { const int g_thd_x = blockDim.x * blockIdx.x + threadIdx.x; @@ -25,9 +25,9 @@ __global__ void grouped_convolution_x( for (int i = 0; i < out_groups; ++i) { float acc = 0.F; - for (int j = 0; j < WSize; ++j) + for (int j = 0; j < wk; ++j) { - const int w_idx = i * WSize + j; + const int w_idx = i * wk + 
j; const int in_idx = (g_thd_x * in_stride_x + j) + g_thd_y * in_stride_y + g_thd_z * in_stride_z; acc += w[w_idx] * in[in_idx]; } @@ -36,8 +36,7 @@ __global__ void grouped_convolution_x( } } -template <int WSize> -__global__ void grouped_convolution_y( +extern "C" __global__ void grouped_convolution_y( int dim_x, int dim_y, int dim_z, @@ -49,7 +48,8 @@ __global__ void grouped_convolution_y( float* out, int out_stride_z, int out_stride_group, - const float* w + const float* w, + int hk ) { const int g_thd_x = blockDim.x * blockIdx.x + threadIdx.x; @@ -68,9 +68,9 @@ __global__ void grouped_convolution_y( for (int i = 0; i < out_groups; ++i) { float acc = 0.F; - for (int j = 0; j < WSize; ++j) + for (int j = 0; j < hk; ++j) { - const int w_idx = (out_groups * group + i) * WSize + j; + const int w_idx = (out_groups * group + i) * hk + j; const int in_idx = g_thd_x * in_stride_x + (item_stride_y * g_thd_y + j) * in_stride_y + group * in_stride_group + g_thd_z * in_stride_z; acc += w[w_idx] * in[in_idx]; } @@ -80,8 +80,7 @@ __global__ void grouped_convolution_y( } } -template <int WSize> -__global__ void transposed_convolution_x( +extern "C" __global__ void transposed_convolution_x( int dim_x, int dim_y, int dim_z, @@ -90,6 +89,7 @@ __global__ void transposed_convolution_x( int in_stride_y, int in_stride_z, const float* w, + int wk, float* out ) { @@ -103,7 +103,7 @@ __global__ void transposed_convolution_x( constexpr int item_out_stride = 2; float acc = 0.F; - for (int i = 0; i < WSize; ++i) + for (int i = 0; i < wk; ++i) { const int in_x = (g_thd_x - i) / item_out_stride; const int in_x_mod = (g_thd_x - i) % item_out_stride; @@ -117,8 +117,7 @@ __global__ void transposed_convolution_x( out[out_idx] = acc; } -template <int WSize> -__global__ void transposed_convolution_y( +extern "C" __global__ void transposed_convolution_y( int dim_x, int dim_y, int dim_z, @@ -127,6 +126,7 @@ __global__ void transposed_convolution_y( int in_stride_y, int in_stride_z, const float* w, + int hk, float* out ) { @@ -140,7 
+140,7 @@ __global__ void transposed_convolution_y( constexpr int item_out_stride = 2; float acc = 0.F; - for (int i = 0; i < WSize; ++i) + for (int i = 0; i < hk; ++i) { const int in_y = (g_thd_y - i) / item_out_stride; const int in_y_mod = (g_thd_y - i) % item_out_stride; diff --git a/httomolibgpu/misc/corr.py b/httomolibgpu/misc/corr.py index 95d1b71a..8a13fc70 100644 --- a/httomolibgpu/misc/corr.py +++ b/httomolibgpu/misc/corr.py @@ -87,8 +87,8 @@ def median_filter( output = cp.copy(data, order="C") # 3d median or dezinger - kernel_args = "median_general_kernel3d<{0}, {1}>".format( - "float" if input_type == "float32" else "unsigned short", kernel_size + kernel_name = "median_general_kernel3d_{0}_{1}".format( + "float" if input_type == "float32" else "unsigned_short", kernel_size ) block_x = 128 # setting grid/block parameters @@ -99,8 +99,8 @@ def median_filter( grid_dims = (grid_x, grid_y, grid_z) params = (data, output, cp.float32(dif), dz, dy, dx) - median_module = load_cuda_module("median_kernel", name_expressions=[kernel_args]) - median_filt = median_module.get_function(kernel_args) + median_module = load_cuda_module("median_kernel") + median_filt = median_module.get_function(kernel_name) median_filt(grid_dims, block_dims, params) diff --git a/httomolibgpu/misc/utils.py b/httomolibgpu/misc/utils.py index 6a6b85bf..cf5f6f2c 100644 --- a/httomolibgpu/misc/utils.py +++ b/httomolibgpu/misc/utils.py @@ -125,12 +125,12 @@ def __naninfs_check( grid_dims = (grid_x, grid_y, grid_z) params = (data, dz, dy, dx, present_nans_infs) - kernel_args = "remove_nan_inf<{0}>".format( - "float" if input_type == "float32" else "unsigned short" + kernel_name = "remove_nan_inf_{0}".format( + "float" if input_type == "float32" else "unsigned_short" ) - module = load_cuda_module("remove_nan_inf", name_expressions=[kernel_args]) - remove_nan_inf_kernel = module.get_function(kernel_args) + module = load_cuda_module("remove_nan_inf") + remove_nan_inf_kernel = 
module.get_function(kernel_name) remove_nan_inf_kernel(grid_dims, block_dims, params) if present_nans_infs[0].get() == 1: diff --git a/httomolibgpu/prep/stripe.py b/httomolibgpu/prep/stripe.py index 3418b89a..c4611781 100644 --- a/httomolibgpu/prep/stripe.py +++ b/httomolibgpu/prep/stripe.py @@ -288,8 +288,7 @@ def _conv2d( w = cp.asarray(w) x = cp.expand_dims(x, axis=1) w = np.expand_dims(w, axis=0) - symbol_names = [f"grouped_convolution_x<{wk}>", f"grouped_convolution_y<{hk}>"] - module = load_cuda_module("remove_stripe_fw", name_expressions=symbol_names) + module = load_cuda_module("remove_stripe_fw") dim_x = out.shape[-1] dim_y = out.shape[-2] dim_z = out.shape[0] @@ -305,7 +304,7 @@ def _conv2d( grid_dim = (grid_x, dim_y, dim_z) if groups == 1: - grouped_convolution_kernel_x = module.get_function(symbol_names[0]) + grouped_convolution_kernel_x = module.get_function("grouped_convolution_x") grouped_convolution_kernel_x( grid_dim, block_dim, @@ -321,11 +320,12 @@ def _conv2d( out_stride_z, out_stride_group, w, + wk, ), ) return out - grouped_convolution_kernel_y = module.get_function(symbol_names[1]) + grouped_convolution_kernel_y = module.get_function("grouped_convolution_y") in_stride_group = x.strides[2] // x.dtype.itemsize grouped_convolution_kernel_y( grid_dim, @@ -343,6 +343,7 @@ def _conv2d( out_stride_z, out_stride_group, w, + hk, ), ) del w @@ -383,11 +384,7 @@ def _conv_transpose2d( out = cp.zeros(out_shape, dtype="float32") w = cp.asarray(w) - symbol_names = [ - f"transposed_convolution_x<{wk}>", - f"transposed_convolution_y<{hk}>", - ] - module = load_cuda_module("remove_stripe_fw", name_expressions=symbol_names) + module = load_cuda_module("remove_stripe_fw") dim_x = out.shape[-1] dim_y = out.shape[-2] dim_z = out.shape[0] @@ -402,18 +399,44 @@ def _conv_transpose2d( grid_dim = (grid_x, dim_y, dim_z) if wk > 1: - transposed_convolution_kernel_x = module.get_function(symbol_names[0]) + transposed_convolution_kernel_x = module.get_function( + 
"transposed_convolution_x" + ) transposed_convolution_kernel_x( grid_dim, block_dim, - (dim_x, dim_y, dim_z, x, in_dim_x, in_stride_y, in_stride_z, w, out), + ( + dim_x, + dim_y, + dim_z, + x, + in_dim_x, + in_stride_y, + in_stride_z, + w, + wk, + out, + ), ) elif hk > 1: - transposed_convolution_kernel_y = module.get_function(symbol_names[1]) + transposed_convolution_kernel_y = module.get_function( + "transposed_convolution_y" + ) transposed_convolution_kernel_y( grid_dim, block_dim, - (dim_x, dim_y, dim_z, x, in_dim_y, in_stride_y, in_stride_z, w, out), + ( + dim_x, + dim_y, + dim_z, + x, + in_dim_y, + in_stride_y, + in_stride_z, + w, + hk, + out, + ), ) else: assert False @@ -1001,7 +1024,7 @@ def raven_filter( height, images, width = data.shape # Set the input type of the kernel - kernel_args = "raven_filter<{0}>".format( + kernel_name = "raven_filter_{0}".format( "float" if calc_type == "complex64" else "double" ) @@ -1014,8 +1037,8 @@ def raven_filter( grid_dims = (grid_x, grid_y, grid_z) params = (fft_data_shifted, fft_data, width, images, height, uvalue, nvalue, vvalue) - raven_module = load_cuda_module("raven_filter", name_expressions=[kernel_args]) - raven_filt = raven_module.get_function(kernel_args) + raven_module = load_cuda_module("raven_filter") + raven_filt = raven_module.get_function(kernel_name) raven_filt(grid_dims, block_dims, params) del fft_data_shifted diff --git a/httomolibgpu/recon/rotation.py b/httomolibgpu/recon/rotation.py index c3aa47d2..41176b55 100644 --- a/httomolibgpu/recon/rotation.py +++ b/httomolibgpu/recon/rotation.py @@ -666,12 +666,6 @@ def _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm): _calc_metrics_module = load_cuda_module( "calc_metrics", - name_expressions=[ - "calc_metrics_kernel<false, false>", - "calc_metrics_kernel<false, true>", - "calc_metrics_kernel<true, false>", - "calc_metrics_kernel<true, true>", - ], options=("--maxrregcount=32",), ) @@ -691,10 +685,12 @@ def _calc_metrics(mat1, mat2, win_width, side, use_overlap, norm): block = (128, 1, 1) grid 
= (1, np.int32(num_pos), 1) smem = block[0] * 4 * 6 if use_overlap else block[0] * 4 * 3 - bool2str = lambda x: "true" if x is True else "false" - calc_metrics = _calc_metrics_module.get_function( - f"calc_metrics_kernel<{bool2str(norm)}, {bool2str(use_overlap)}>" - ) + calc_metrics_kernel_name = "calc_metrics_kernel" + if norm: + calc_metrics_kernel_name += "_norm" + if use_overlap: + calc_metrics_kernel_name += "_use_overlap" + calc_metrics = _calc_metrics_module.get_function(calc_metrics_kernel_name) calc_metrics(grid=grid, block=block, args=args, shared_mem=smem) return list_metric