Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions httomolibgpu/cuda_kernels/calc_metrics.cu
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,7 @@ __device__ void _calc_metrics_overlap(const float *mat1, int mat1_nx,
* the compiler (rather than at runtime), which reduces the register count.
*/
template <bool norm, bool use_overlap>
__global__ void calc_metrics_kernel(const float *mat1, int mat1_nx,
__device__ __forceinline__ void calc_metrics_kernel_impl(const float *mat1, int mat1_nx,
const float *mat2, int mat2_nx,
int win_width, int rows, int side,
float *list_metric)
Expand All @@ -331,4 +331,28 @@ __global__ void calc_metrics_kernel(const float *mat1, int mat1_nx,
} else {
_calc_metrics_no_overlap<norm>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric);
}
}
}

extern "C" __global__ void calc_metrics_kernel(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx,
int win_width, int rows, int side, float *list_metric)
{
calc_metrics_kernel_impl<false, false>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric);
}

extern "C" __global__ void calc_metrics_kernel_norm(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx,
int win_width, int rows, int side, float *list_metric)
{
calc_metrics_kernel_impl<true, false>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric);
}

extern "C" __global__ void calc_metrics_kernel_use_overlap(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx,
int win_width, int rows, int side, float *list_metric)
{
calc_metrics_kernel_impl<false, true>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric);
}

extern "C" __global__ void calc_metrics_kernel_norm_use_overlap(const float *mat1, int mat1_nx, const float *mat2, int mat2_nx,
int win_width, int rows, int side, float *list_metric)
{
calc_metrics_kernel_impl<true, true>(mat1, mat1_nx, mat2, mat2_nx, win_width, rows, side, list_metric);
}
64 changes: 62 additions & 2 deletions httomolibgpu/cuda_kernels/median_kernel.cu
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
template <typename Type, int diameter>
__global__ void median_general_kernel3d(const Type *in, Type *out, float dif,
int Z, int M, int N) {
__device__ __forceinline__ void median_general_kernel3d_impl(const Type *in, Type *out, float dif,
int Z, int M, int N) {
constexpr int radius = diameter / 2;
constexpr int d3 = diameter * diameter * diameter;
constexpr int midpoint = d3 / 2;
Expand Down Expand Up @@ -52,3 +52,63 @@ __global__ void median_general_kernel3d(const Type *in, Type *out, float dif,
}
else out[index] = ValVec[midpoint]; /* median filtering */
}

extern "C" __global__ void median_general_kernel3d_float_3(const float *in, float *out, float dif, int Z, int M, int N)
{
median_general_kernel3d_impl<float, 3>(in, out, dif, Z, M, N);
}

extern "C" __global__ void median_general_kernel3d_float_5(const float *in, float *out, float dif, int Z, int M, int N)
{
median_general_kernel3d_impl<float, 5>(in, out, dif, Z, M, N);
}

extern "C" __global__ void median_general_kernel3d_float_7(const float *in, float *out, float dif, int Z, int M, int N)
{
median_general_kernel3d_impl<float, 7>(in, out, dif, Z, M, N);
}

extern "C" __global__ void median_general_kernel3d_float_9(const float *in, float *out, float dif, int Z, int M, int N)
{
median_general_kernel3d_impl<float, 9>(in, out, dif, Z, M, N);
}

extern "C" __global__ void median_general_kernel3d_float_11(const float *in, float *out, float dif, int Z, int M, int N)
{
median_general_kernel3d_impl<float, 11>(in, out, dif, Z, M, N);
}

extern "C" __global__ void median_general_kernel3d_float_13(const float *in, float *out, float dif, int Z, int M, int N)
{
median_general_kernel3d_impl<float, 13>(in, out, dif, Z, M, N);
}

extern "C" __global__ void median_general_kernel3d_unsigned_short_3(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N)
{
median_general_kernel3d_impl<unsigned short, 3>(in, out, dif, Z, M, N);
}

extern "C" __global__ void median_general_kernel3d_unsigned_short_5(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N)
{
median_general_kernel3d_impl<unsigned short, 5>(in, out, dif, Z, M, N);
}

extern "C" __global__ void median_general_kernel3d_unsigned_short_7(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N)
{
median_general_kernel3d_impl<unsigned short, 7>(in, out, dif, Z, M, N);
}

extern "C" __global__ void median_general_kernel3d_unsigned_short_9(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N)
{
median_general_kernel3d_impl<unsigned short, 9>(in, out, dif, Z, M, N);
}

extern "C" __global__ void median_general_kernel3d_unsigned_short_11(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N)
{
median_general_kernel3d_impl<unsigned short, 11>(in, out, dif, Z, M, N);
}

extern "C" __global__ void median_general_kernel3d_unsigned_short_13(const unsigned short *in, unsigned short *out, float dif, int Z, int M, int N)
{
median_general_kernel3d_impl<unsigned short, 13>(in, out, dif, Z, M, N);
}
20 changes: 18 additions & 2 deletions httomolibgpu/cuda_kernels/raven_filter.cu
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#include <cupy/complex.cuh>

template <typename Type>
__global__ void
raven_filter(
__device__ __forceinline__ void
raven_filter_impl(
complex<Type> *input,
complex<Type> *output,
int width, int images, int height,
Expand Down Expand Up @@ -47,3 +47,19 @@ raven_filter(

output[outIndex] = value;
}

extern "C" __global__ void raven_filter_float(complex<float> *input,
complex<float> *output,
int width, int images, int height,
int u0, int n, int v0)
{
raven_filter_impl<float>(input, output, width, images, height, u0, n, v0);
}

extern "C" __global__ void raven_filter_double(complex<double> *input,
complex<double> *output,
int width, int images, int height,
int u0, int n, int v0)
{
raven_filter_impl<double>(input, output, width, images, height, u0, n, v0);
}
14 changes: 12 additions & 2 deletions httomolibgpu/cuda_kernels/remove_nan_inf.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
template <typename Type>
__global__ void remove_nan_inf(Type *data, int Z, int M, int N, int *result) {
__device__ __forceinline__ void remove_nan_inf_impl(Type *data, int Z, int M, int N, int *result) {
const long i = blockDim.x * blockIdx.x + threadIdx.x;
const long j = blockDim.y * blockIdx.y + threadIdx.y;
const long k = blockDim.z * blockIdx.z + threadIdx.z;
Expand All @@ -16,4 +16,14 @@ __global__ void remove_nan_inf(Type *data, int Z, int M, int N, int *result) {
data[index] = zero;
}

}
}

extern "C" __global__ void remove_nan_inf_float(float *data, int Z, int M, int N, int *result)
{
remove_nan_inf_impl<float>(data, Z, M, N, result);
}

extern "C" __global__ void remove_nan_inf_unsigned_short(unsigned short *data, int Z, int M, int N, int *result)
{
remove_nan_inf_impl<unsigned short>(data, Z, M, N, result);
}
32 changes: 16 additions & 16 deletions httomolibgpu/cuda_kernels/remove_stripe_fw.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
template<int WSize>
__global__ void grouped_convolution_x(
extern "C" __global__ void grouped_convolution_x(
int dim_x,
int dim_y,
int dim_z,
Expand All @@ -10,7 +9,8 @@ __global__ void grouped_convolution_x(
float* out,
int out_stride_z,
int out_stride_group,
const float* w
const float* w,
int wk
)
{
const int g_thd_x = blockDim.x * blockIdx.x + threadIdx.x;
Expand All @@ -25,9 +25,9 @@ __global__ void grouped_convolution_x(
for (int i = 0; i < out_groups; ++i)
{
float acc = 0.F;
for (int j = 0; j < WSize; ++j)
for (int j = 0; j < wk; ++j)
{
const int w_idx = i * WSize + j;
const int w_idx = i * wk + j;
const int in_idx = (g_thd_x * in_stride_x + j) + g_thd_y * in_stride_y + g_thd_z * in_stride_z;
acc += w[w_idx] * in[in_idx];
}
Expand All @@ -36,8 +36,7 @@ __global__ void grouped_convolution_x(
}
}

template<int WSize>
__global__ void grouped_convolution_y(
extern "C" __global__ void grouped_convolution_y(
int dim_x,
int dim_y,
int dim_z,
Expand All @@ -49,7 +48,8 @@ __global__ void grouped_convolution_y(
float* out,
int out_stride_z,
int out_stride_group,
const float* w
const float* w,
int hk
)
{
const int g_thd_x = blockDim.x * blockIdx.x + threadIdx.x;
Expand All @@ -68,9 +68,9 @@ __global__ void grouped_convolution_y(
for (int i = 0; i < out_groups; ++i)
{
float acc = 0.F;
for (int j = 0; j < WSize; ++j)
for (int j = 0; j < hk; ++j)
{
const int w_idx = (out_groups * group + i) * WSize + j;
const int w_idx = (out_groups * group + i) * hk + j;
const int in_idx = g_thd_x * in_stride_x + (item_stride_y * g_thd_y + j) * in_stride_y + group * in_stride_group + g_thd_z * in_stride_z;
acc += w[w_idx] * in[in_idx];
}
Expand All @@ -80,8 +80,7 @@ __global__ void grouped_convolution_y(
}
}

template<int WSize>
__global__ void transposed_convolution_x(
extern "C" __global__ void transposed_convolution_x(
int dim_x,
int dim_y,
int dim_z,
Expand All @@ -90,6 +89,7 @@ __global__ void transposed_convolution_x(
int in_stride_y,
int in_stride_z,
const float* w,
int wk,
float* out
)
{
Expand All @@ -103,7 +103,7 @@ __global__ void transposed_convolution_x(

constexpr int item_out_stride = 2;
float acc = 0.F;
for (int i = 0; i < WSize; ++i)
for (int i = 0; i < wk; ++i)
{
const int in_x = (g_thd_x - i) / item_out_stride;
const int in_x_mod = (g_thd_x - i) % item_out_stride;
Expand All @@ -117,8 +117,7 @@ __global__ void transposed_convolution_x(
out[out_idx] = acc;
}

template<int WSize>
__global__ void transposed_convolution_y(
extern "C" __global__ void transposed_convolution_y(
int dim_x,
int dim_y,
int dim_z,
Expand All @@ -127,6 +126,7 @@ __global__ void transposed_convolution_y(
int in_stride_y,
int in_stride_z,
const float* w,
int hk,
float* out
)
{
Expand All @@ -140,7 +140,7 @@ __global__ void transposed_convolution_y(

constexpr int item_out_stride = 2;
float acc = 0.F;
for (int i = 0; i < WSize; ++i)
for (int i = 0; i < hk; ++i)
{
const int in_y = (g_thd_y - i) / item_out_stride;
const int in_y_mod = (g_thd_y - i) % item_out_stride;
Expand Down
8 changes: 4 additions & 4 deletions httomolibgpu/misc/corr.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ def median_filter(
output = cp.copy(data, order="C")

# 3d median or dezinger
kernel_args = "median_general_kernel3d<{0}, {1}>".format(
"float" if input_type == "float32" else "unsigned short", kernel_size
kernel_name = "median_general_kernel3d_{0}_{1}".format(
"float" if input_type == "float32" else "unsigned_short", kernel_size
)
block_x = 128
# setting grid/block parameters
Expand All @@ -99,8 +99,8 @@ def median_filter(
grid_dims = (grid_x, grid_y, grid_z)
params = (data, output, cp.float32(dif), dz, dy, dx)

median_module = load_cuda_module("median_kernel", name_expressions=[kernel_args])
median_filt = median_module.get_function(kernel_args)
median_module = load_cuda_module("median_kernel")
median_filt = median_module.get_function(kernel_name)

median_filt(grid_dims, block_dims, params)

Expand Down
8 changes: 4 additions & 4 deletions httomolibgpu/misc/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,12 +125,12 @@ def __naninfs_check(
grid_dims = (grid_x, grid_y, grid_z)
params = (data, dz, dy, dx, present_nans_infs)

kernel_args = "remove_nan_inf<{0}>".format(
"float" if input_type == "float32" else "unsigned short"
kernel_name = "remove_nan_inf_{0}".format(
"float" if input_type == "float32" else "unsigned_short"
)

module = load_cuda_module("remove_nan_inf", name_expressions=[kernel_args])
remove_nan_inf_kernel = module.get_function(kernel_args)
module = load_cuda_module("remove_nan_inf")
remove_nan_inf_kernel = module.get_function(kernel_name)
remove_nan_inf_kernel(grid_dims, block_dims, params)

if present_nans_infs[0].get() == 1:
Expand Down
Loading
Loading