Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 40 additions & 26 deletions source/source_lcao/module_gint/kernel/phi_operator_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include "phi_operator_kernel.cuh"
#include "dgemm_vbatch.h"
#include <cuda_runtime.h>
#include <type_traits>
#include "source_base/module_device/device_check.h"

namespace ModuleGint
Expand Down Expand Up @@ -124,30 +125,41 @@ void PhiOperatorGpu<Real>::set_phi_dphi(double* phi_d, double* dphi_x_d, double*
{
dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size());
dim3 threads_per_block(64);
set_phi_dphi_kernel<<<grid_dim, threads_per_block, 0, stream_>>>(
gint_gpu_vars_->nwmax,
mgrids_num_,
gint_gpu_vars_->nr_max,
gint_gpu_vars_->dr_uniform,
gint_gpu_vars_->ucell_atom_nwl_d,
gint_gpu_vars_->atom_iw2_new_d,
gint_gpu_vars_->atom_iw2_ylm_d,
gint_gpu_vars_->atom_iw2_l_d,
gint_gpu_vars_->atom_nw_d,
gint_gpu_vars_->iat2it_d,
gint_gpu_vars_->rcut_d,
gint_gpu_vars_->psi_u_d,
gint_gpu_vars_->dpsi_u_d,
gint_gpu_vars_->mgrids_pos_d,
atoms_iat_.get_device_ptr(),
atoms_bgrids_rcoords_.get_device_ptr(),
atoms_num_info_.get_device_ptr(),
atom_phi_start_.get_device_ptr(),
bgrid_phi_len_.get_device_ptr(),
phi_d,
dphi_x_d,
dphi_y_d,
dphi_z_d);
// Dispatch the WantPhi template based on whether phi is requested.
// Lets the compiler drop the phi[] stores entirely in the dphi-only case
// (gint_tau) without paying a per-iw `phi != nullptr` branch in the loop.
auto launch = [&](auto want_phi) {
constexpr bool WantPhi = decltype(want_phi)::value;
set_phi_dphi_kernel<WantPhi><<<grid_dim, threads_per_block, 0, stream_>>>(
gint_gpu_vars_->nwmax,
mgrids_num_,
gint_gpu_vars_->nr_max,
gint_gpu_vars_->dr_uniform,
gint_gpu_vars_->ucell_atom_nwl_d,
gint_gpu_vars_->atom_iw2_new_d,
gint_gpu_vars_->atom_iw2_ylm_d,
gint_gpu_vars_->atom_iw2_l_d,
gint_gpu_vars_->atom_nw_d,
gint_gpu_vars_->iat2it_d,
gint_gpu_vars_->rcut_d,
gint_gpu_vars_->psi_u_d,
gint_gpu_vars_->dpsi_u_d,
gint_gpu_vars_->mgrids_pos_d,
atoms_iat_.get_device_ptr(),
atoms_bgrids_rcoords_.get_device_ptr(),
atoms_num_info_.get_device_ptr(),
atom_phi_start_.get_device_ptr(),
bgrid_phi_len_.get_device_ptr(),
phi_d,
dphi_x_d,
dphi_y_d,
dphi_z_d);
};
if (phi_d != nullptr) {
launch(std::true_type{});
} else {
launch(std::false_type{});
}
CHECK_LAST_CUDA_ERROR("kernel launch");
}

Expand Down Expand Up @@ -427,8 +439,9 @@ void PhiOperatorGpu<Real>::phi_dot_dphi(
{
dim3 grid_dim(bgrid_batch_->get_max_atoms_per_bgrid(),
bgrid_batch_->get_batch_size());
// Kernel reduce is single-warp; blockDim.x MUST stay 32.
dim3 threads_per_block(32);
phi_dot_dphi_kernel<<<grid_dim, threads_per_block, sizeof(double) * 32 * 3, stream_>>>(
phi_dot_dphi_kernel<<<grid_dim, threads_per_block, 0, stream_>>>(
phi_d,
dphi_x_d,
dphi_y_d,
Expand All @@ -454,8 +467,9 @@ void PhiOperatorGpu<Real>::phi_dot_dphi_r(
{
dim3 grid_dim(mgrids_num_,
bgrid_batch_->get_batch_size());
// Kernel reduce is single-warp; blockDim.x MUST stay 32.
dim3 threads_per_block(32);
phi_dot_dphi_r_kernel<<<grid_dim, threads_per_block, sizeof(double) * 32 * 6, stream_>>>(
phi_dot_dphi_r_kernel<<<grid_dim, threads_per_block, 0, stream_>>>(
phi_d,
dphi_x_d,
dphi_y_d,
Expand Down
136 changes: 61 additions & 75 deletions source/source_lcao/module_gint/kernel/phi_operator_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ template __global__ void set_phi_kernel<float>(
const int*, const double3*, const int2*, const int*, const int*,
float*);

template<bool WantPhi>
__global__ void set_phi_dphi_kernel(
const int nwmax,
const int mgrids_num,
Expand Down Expand Up @@ -156,6 +157,7 @@ __global__ void set_phi_dphi_kernel(
grad_rl_sph_harm(nwl, coord.x, coord.y, coord.z, rly, grly);

// interpolation
const double inv_dist = 1.0 / dist; // hoisted: re-used by every iw below
const double pos = dist / dr_uniform;
const int ip = static_cast<int>(pos);
const double x0 = pos - ip;
Expand All @@ -182,15 +184,17 @@ __global__ void set_phi_dphi_kernel(
const int iw_l = atom_iw2_l[it_nw + iw];
const int idx_ylm = atom_iw2_ylm [it_nw + iw];
const double rl = pow_int(dist, iw_l);
const double tmprl = tmp / rl;
const double inv_rl = 1.0 / rl;
const double tmprl = tmp * inv_rl;

// if phi == nullptr, it means that we only need dphi.
if(phi != nullptr)
if (WantPhi)
{
phi[phi_idx + iw] = tmprl * rly[idx_ylm];
}
// derivative of wave functions with respect to atom positions.
const double tmpdphi_rly = (dtmp - tmp * iw_l / dist) / rl * rly[idx_ylm] / dist;
// (dtmp - tmp*iw_l/dist) / rl * rly / dist == (dtmp*inv_dist - tmp*iw_l*inv_dist^2) * inv_rl * rly
const double tmpdphi_rly = (dtmp * inv_dist - tmp * iw_l * inv_dist * inv_dist)
* inv_rl * rly[idx_ylm];

dphi_x[phi_idx + iw] = tmpdphi_rly * coord.x + tmprl * grly[idx_ylm * 3 + 0];
dphi_y[phi_idx + iw] = tmpdphi_rly * coord.y + tmprl * grly[idx_ylm * 3 + 1];
Expand All @@ -203,7 +207,7 @@ __global__ void set_phi_dphi_kernel(
bgrid_phi_len[bgrid_id] * mgrid_id;
for (int iw = 0; iw < atom_nw[atom_type]; iw++)
{
if(phi != nullptr)
if (WantPhi)
{
phi[phi_idx + iw] = 0.0;
}
Expand All @@ -215,6 +219,20 @@ __global__ void set_phi_dphi_kernel(
}
}

// Explicit instantiations for set_phi_dphi_kernel
template __global__ void set_phi_dphi_kernel<true>(
const int, const int, const int, const double,
const int*, const bool*, const int*, const int*, const int*, const int*,
const double*, const double*, const double*, const double3*,
const int*, const double3*, const int2*, const int*, const int*,
double*, double*, double*, double*);
template __global__ void set_phi_dphi_kernel<false>(
const int, const int, const int, const double,
const int*, const bool*, const int*, const int*, const int*, const int*,
const double*, const double*, const double*, const double3*,
const int*, const double3*, const int2*, const int*, const int*,
double*, double*, double*, double*);

// The code for `set_ddphi_kernel` is quite difficult to understand.
// To grasp it, you better refer to the CPU function `set_ddphi`
__global__ void set_ddphi_kernel(
Expand Down Expand Up @@ -264,7 +282,8 @@ __global__ void set_ddphi_kernel(
bgrid_phi_len[bgrid_id] * mgrid_id;
for(int i = 0; i < 6; i++)
{
coord[i/2] += std::pow(-1, i%2) * 0.0001;
const double eps = (i & 1) ? -0.0001 : 0.0001;
coord[i/2] += eps;
double dist = norm3d(coord[0], coord[1], coord[2]);
if (dist < 1.0E-9)
{ dist += 1.0E-9; }
Expand All @@ -276,6 +295,7 @@ __global__ void set_ddphi_kernel(
grad_rl_sph_harm(nwl, coord[0], coord[1], coord[2], rly, grly);

// interpolation
const double inv_dist = 1.0 / dist; // hoisted: re-used by every iw
const double pos = dist / dr_uniform;
const int ip = static_cast<int>(pos);
const double x0 = pos - ip;
Expand All @@ -300,8 +320,10 @@ __global__ void set_ddphi_kernel(
const int iw_l = atom_iw2_l[it_nw + iw];
const int idx_ylm = atom_iw2_ylm [it_nw + iw];
const double rl = pow_int(dist, iw_l);
const double tmprl = tmp / rl;
const double tmpdphi_rly = (dtmp - tmp * iw_l / dist) / rl * rly[idx_ylm] / dist;
const double inv_rl = 1.0 / rl;
const double tmprl = tmp * inv_rl;
const double tmpdphi_rly = (dtmp * inv_dist - tmp * iw_l * inv_dist * inv_dist)
* inv_rl * rly[idx_ylm];

double dphi[3];
dphi[0] = tmpdphi_rly * coord[0] + tmprl * grly[idx_ylm * 3 + 0];
Expand Down Expand Up @@ -340,7 +362,7 @@ __global__ void set_ddphi_kernel(
ddphi_zz[phi_idx + iw] -= dphi[2];
}
}
coord[i/2] -= std::pow(-1, i%2) * 0.0001; // recover coord
coord[i/2] -= eps; // recover coord
}

for (int iw = 0; iw < atom_nw[atom_type]; iw++)
Expand Down Expand Up @@ -460,14 +482,14 @@ __global__ void phi_dot_dphi_kernel(
const int* __restrict__ atom_nw,
double* force)
{
__shared__ double s_data[32 * 3]; // the length of s_data equals the max warp num of a block times 3
// NOTE: this kernel assumes blockDim.x == 32 (a single warp). If the launch
// configuration is ever changed, the reduce below needs a shared-memory stage.
const int bgrid_id = blockIdx.y;
const int atoms_num = atoms_num_info[bgrid_id].x;
const int pre_atoms_num = atoms_num_info[bgrid_id].y;
const int b_phi_len = bgrid_phi_len[bgrid_id];
const int tid = threadIdx.x;
const int warp_id = tid / 32;
const int lane_id = tid % 32;
const int lane_id = tid; // blockDim.x == 32

for (int atom_id = blockIdx.x; atom_id < atoms_num; atom_id += gridDim.x)
{
Expand All @@ -481,44 +503,23 @@ __global__ void phi_dot_dphi_kernel(
for (int iw = tid; iw < nw; iw += blockDim.x)
{
int phi_idx = phi_start + iw;
f[0] += phi[phi_idx] * dphi_x[phi_idx];
f[1] += phi[phi_idx] * dphi_y[phi_idx];
f[2] += phi[phi_idx] * dphi_z[phi_idx];
const double p = phi[phi_idx];
f[0] += p * dphi_x[phi_idx];
f[1] += p * dphi_y[phi_idx];
f[2] += p * dphi_z[phi_idx];
}
}

// reduce the force in each block
for (int i = 0; i < 3; i++)
{
f[i] = warpReduceSum(f[i]);
}
// single-warp reduce
f[0] = warpReduceSum(f[0]);
f[1] = warpReduceSum(f[1]);
f[2] = warpReduceSum(f[2]);

if (lane_id == 0)
{
for (int i = 0; i < 3; i++)
{
s_data[warp_id * 3 + i] = f[i];
}
}
__syncthreads();

for (int i = 0; i < 3; i++)
{
f[i] = (tid < blockDim.x / 32) ? s_data[tid * 3 + i] : 0;
}
if (warp_id == 0)
{
for (int i = 0; i < 3; i++)
{
f[i] = warpReduceSum(f[i]);
}
}
if (tid == 0)
{
for (int i = 0; i < 3; i++)
{
atomicAdd(&force[iat * 3 + i], f[i] * 2);
}
atomicAdd(&force[iat * 3 + 0], f[0] * 2);
atomicAdd(&force[iat * 3 + 1], f[1] * 2);
atomicAdd(&force[iat * 3 + 2], f[2] * 2);
}
}
}
Expand All @@ -539,14 +540,14 @@ __global__ void phi_dot_dphi_r_kernel(
const int* __restrict__ atom_nw,
double* __restrict__ svl)
{
__shared__ double s_data[32 * 6]; // the length of s_data equals the max warp num of a block times 6
// NOTE: this kernel assumes blockDim.x == 32 (a single warp). If the launch
// configuration is ever changed, the reduce below needs a shared-memory stage.
const int tid = threadIdx.x;
const int bgrid_id = blockIdx.y;
const int atoms_num = atoms_num_info[bgrid_id].x;
const int pre_atoms_num = atoms_num_info[bgrid_id].y;
const int b_phi_len = bgrid_phi_len[bgrid_id];
const int warp_id = tid / 32;
const int lane_id = tid % 32;
const int lane_id = tid; // blockDim.x == 32

double stress[6]{0.0};
for (int mgrid_id = blockIdx.x; mgrid_id < mgrids_per_bgrid; mgrid_id += gridDim.x)
Expand All @@ -564,44 +565,29 @@ __global__ void phi_dot_dphi_r_kernel(
for (int iw = tid; iw < nw; iw += blockDim.x)
{
int phi_idx = phi_start + iw;
stress[0] += phi[phi_idx] * dphi_x[phi_idx] * coord.x;
stress[1] += phi[phi_idx] * dphi_x[phi_idx] * coord.y;
stress[2] += phi[phi_idx] * dphi_x[phi_idx] * coord.z;
stress[3] += phi[phi_idx] * dphi_y[phi_idx] * coord.y;
stress[4] += phi[phi_idx] * dphi_y[phi_idx] * coord.z;
stress[5] += phi[phi_idx] * dphi_z[phi_idx] * coord.z;
const double p = phi[phi_idx];
const double pdx = p * dphi_x[phi_idx];
const double pdy = p * dphi_y[phi_idx];
const double pdz = p * dphi_z[phi_idx];
stress[0] += pdx * coord.x;
stress[1] += pdx * coord.y;
stress[2] += pdx * coord.z;
stress[3] += pdy * coord.y;
stress[4] += pdy * coord.z;
stress[5] += pdz * coord.z;
}
}
}

// reduce the stress in each block
// single-warp reduce
#pragma unroll
for (int i = 0; i < 6; i++)
{
stress[i] = warpReduceSum(stress[i]);
}

if (lane_id == 0)
{
for (int i = 0; i < 6; i++)
{
s_data[warp_id * 6 + i] = stress[i];
}
}
__syncthreads();

for (int i = 0; i < 6; i++)
{
stress[i] = (tid < blockDim.x / 32) ? s_data[tid * 6 + i] : 0;
}
if (warp_id == 0)
{
for (int i = 0; i < 6; i++)
{
stress[i] = warpReduceSum(stress[i]);
}
}
if (tid == 0)
{
#pragma unroll
for (int i = 0; i < 6; i++)
{
atomicAdd(&svl[i], stress[i] * 2);
Expand Down
2 changes: 2 additions & 0 deletions source/source_lcao/module_gint/kernel/phi_operator_kernel.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ __global__ void set_phi_kernel(
const int* __restrict__ bgrid_phi_len,
Real* __restrict__ phi);

// WantPhi == false: skip phi[] writes entirely (callers like gint_tau pass nullptr).
template<bool WantPhi>
__global__ void set_phi_dphi_kernel(
const int nwmax,
const int mgrids_num,
Expand Down
Loading