diff --git a/source/source_lcao/module_gint/kernel/phi_operator_gpu.cu b/source/source_lcao/module_gint/kernel/phi_operator_gpu.cu index 29e27a852a..96735414d4 100644 --- a/source/source_lcao/module_gint/kernel/phi_operator_gpu.cu +++ b/source/source_lcao/module_gint/kernel/phi_operator_gpu.cu @@ -2,6 +2,7 @@ #include "phi_operator_kernel.cuh" #include "dgemm_vbatch.h" #include +#include #include "source_base/module_device/device_check.h" namespace ModuleGint @@ -124,30 +125,41 @@ void PhiOperatorGpu::set_phi_dphi(double* phi_d, double* dphi_x_d, double* { dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); dim3 threads_per_block(64); - set_phi_dphi_kernel<<>>( - gint_gpu_vars_->nwmax, - mgrids_num_, - gint_gpu_vars_->nr_max, - gint_gpu_vars_->dr_uniform, - gint_gpu_vars_->ucell_atom_nwl_d, - gint_gpu_vars_->atom_iw2_new_d, - gint_gpu_vars_->atom_iw2_ylm_d, - gint_gpu_vars_->atom_iw2_l_d, - gint_gpu_vars_->atom_nw_d, - gint_gpu_vars_->iat2it_d, - gint_gpu_vars_->rcut_d, - gint_gpu_vars_->psi_u_d, - gint_gpu_vars_->dpsi_u_d, - gint_gpu_vars_->mgrids_pos_d, - atoms_iat_.get_device_ptr(), - atoms_bgrids_rcoords_.get_device_ptr(), - atoms_num_info_.get_device_ptr(), - atom_phi_start_.get_device_ptr(), - bgrid_phi_len_.get_device_ptr(), - phi_d, - dphi_x_d, - dphi_y_d, - dphi_z_d); + // Dispatch the WantPhi template based on whether phi is requested. + // Lets the compiler drop the phi[] stores entirely in the dphi-only case + // (gint_tau) without paying a per-iw `phi != nullptr` branch in the loop. + auto launch = [&](auto want_phi) { + constexpr bool WantPhi = decltype(want_phi)::value; + set_phi_dphi_kernel<<>>( + gint_gpu_vars_->nwmax, + mgrids_num_, + gint_gpu_vars_->nr_max, + gint_gpu_vars_->dr_uniform, + gint_gpu_vars_->ucell_atom_nwl_d, + gint_gpu_vars_->atom_iw2_new_d, + gint_gpu_vars_->atom_iw2_ylm_d, + gint_gpu_vars_->atom_iw2_l_d, + gint_gpu_vars_->atom_nw_d, + gint_gpu_vars_->iat2it_d, + gint_gpu_vars_->rcut_d, + gint_gpu_vars_->psi_u_d, + gint_gpu_vars_->dpsi_u_d, + gint_gpu_vars_->mgrids_pos_d, + atoms_iat_.get_device_ptr(), + atoms_bgrids_rcoords_.get_device_ptr(), + atoms_num_info_.get_device_ptr(), + atom_phi_start_.get_device_ptr(), + bgrid_phi_len_.get_device_ptr(), + phi_d, + dphi_x_d, + dphi_y_d, + dphi_z_d); + }; + if (phi_d != nullptr) { + launch(std::true_type{}); + } else { + launch(std::false_type{}); + } CHECK_LAST_CUDA_ERROR("kernel launch"); } @@ -427,8 +439,9 @@ void PhiOperatorGpu::phi_dot_dphi( { dim3 grid_dim(bgrid_batch_->get_max_atoms_per_bgrid(), bgrid_batch_->get_batch_size()); + // Kernel reduce is single-warp; blockDim.x MUST stay 32. dim3 threads_per_block(32); - phi_dot_dphi_kernel<<>>( + phi_dot_dphi_kernel<<>>( phi_d, dphi_x_d, dphi_y_d, @@ -454,8 +467,9 @@ void PhiOperatorGpu::phi_dot_dphi_r( { dim3 grid_dim(mgrids_num_, bgrid_batch_->get_batch_size()); + // Kernel reduce is single-warp; blockDim.x MUST stay 32. dim3 threads_per_block(32); - phi_dot_dphi_r_kernel<<>>( + phi_dot_dphi_r_kernel<<>>( phi_d, dphi_x_d, dphi_y_d, diff --git a/source/source_lcao/module_gint/kernel/phi_operator_kernel.cu b/source/source_lcao/module_gint/kernel/phi_operator_kernel.cu index aa2dcc9fb7..576e8bfa72 100644 --- a/source/source_lcao/module_gint/kernel/phi_operator_kernel.cu +++ b/source/source_lcao/module_gint/kernel/phi_operator_kernel.cu @@ -105,6 +105,7 @@ template __global__ void set_phi_kernel( const int*, const double3*, const int2*, const int*, const int*, float*); +template __global__ void set_phi_dphi_kernel( const int nwmax, const int mgrids_num, @@ -156,6 +157,7 @@ __global__ void set_phi_dphi_kernel( grad_rl_sph_harm(nwl, coord.x, coord.y, coord.z, rly, grly); // interpolation + const double inv_dist = 1.0 / dist; // hoisted: re-used by every iw below const double pos = dist / dr_uniform; const int ip = static_cast(pos); const double x0 = pos - ip; @@ -182,15 +184,17 @@ __global__ void set_phi_dphi_kernel( const int iw_l = atom_iw2_l[it_nw + iw]; const int idx_ylm = atom_iw2_ylm [it_nw + iw]; const double rl = pow_int(dist, iw_l); - const double tmprl = tmp / rl; + const double inv_rl = 1.0 / rl; + const double tmprl = tmp * inv_rl; - // if phi == nullptr, it means that we only need dphi. - if(phi != nullptr) + if (WantPhi) { phi[phi_idx + iw] = tmprl * rly[idx_ylm]; } // derivative of wave functions with respect to atom positions. - const double tmpdphi_rly = (dtmp - tmp * iw_l / dist) / rl * rly[idx_ylm] / dist; + // (dtmp - tmp*iw_l/dist) / rl * rly / dist == (dtmp*inv_dist - tmp*iw_l*inv_dist^2) * inv_rl * rly + const double tmpdphi_rly = (dtmp * inv_dist - tmp * iw_l * inv_dist * inv_dist) + * inv_rl * rly[idx_ylm]; dphi_x[phi_idx + iw] = tmpdphi_rly * coord.x + tmprl * grly[idx_ylm * 3 + 0]; dphi_y[phi_idx + iw] = tmpdphi_rly * coord.y + tmprl * grly[idx_ylm * 3 + 1]; @@ -203,7 +207,7 @@ __global__ void set_phi_dphi_kernel( bgrid_phi_len[bgrid_id] * mgrid_id; for (int iw = 0; iw < atom_nw[atom_type]; iw++) { - if(phi != nullptr) + if (WantPhi) { phi[phi_idx + iw] = 0.0; } @@ -215,6 +219,20 @@ __global__ void set_phi_dphi_kernel( } } +// Explicit instantiations for set_phi_dphi_kernel +template __global__ void set_phi_dphi_kernel( + const int, const int, const int, const double, + const int*, const bool*, const int*, const int*, const int*, const int*, + const double*, const double*, const double*, const double3*, + const int*, const double3*, const int2*, const int*, const int*, + double*, double*, double*, double*); +template __global__ void set_phi_dphi_kernel( + const int, const int, const int, const double, + const int*, const bool*, const int*, const int*, const int*, const int*, + const double*, const double*, const double*, const double3*, + const int*, const double3*, const int2*, const int*, const int*, + double*, double*, double*, double*); + // The code for `set_ddphi_kernel` is quite difficult to understand. // To grasp it, you better refer to the CPU function `set_ddphi` __global__ void set_ddphi_kernel( @@ -264,7 +282,8 @@ __global__ void set_ddphi_kernel( bgrid_phi_len[bgrid_id] * mgrid_id; for(int i = 0; i < 6; i++) { - coord[i/2] += std::pow(-1, i%2) * 0.0001; + const double eps = (i & 1) ? -0.0001 : 0.0001; + coord[i/2] += eps; double dist = norm3d(coord[0], coord[1], coord[2]); if (dist < 1.0E-9) { dist += 1.0E-9; } @@ -276,6 +295,7 @@ __global__ void set_ddphi_kernel( grad_rl_sph_harm(nwl, coord[0], coord[1], coord[2], rly, grly); // interpolation + const double inv_dist = 1.0 / dist; // hoisted: re-used by every iw const double pos = dist / dr_uniform; const int ip = static_cast(pos); const double x0 = pos - ip; @@ -300,8 +320,10 @@ __global__ void set_ddphi_kernel( const int iw_l = atom_iw2_l[it_nw + iw]; const int idx_ylm = atom_iw2_ylm [it_nw + iw]; const double rl = pow_int(dist, iw_l); - const double tmprl = tmp / rl; - const double tmpdphi_rly = (dtmp - tmp * iw_l / dist) / rl * rly[idx_ylm] / dist; + const double inv_rl = 1.0 / rl; + const double tmprl = tmp * inv_rl; + const double tmpdphi_rly = (dtmp * inv_dist - tmp * iw_l * inv_dist * inv_dist) + * inv_rl * rly[idx_ylm]; double dphi[3]; dphi[0] = tmpdphi_rly * coord[0] + tmprl * grly[idx_ylm * 3 + 0]; @@ -340,7 +362,7 @@ __global__ void set_ddphi_kernel( ddphi_zz[phi_idx + iw] -= dphi[2]; } } - coord[i/2] -= std::pow(-1, i%2) * 0.0001; // recover coord + coord[i/2] -= eps; // recover coord } for (int iw = 0; iw < atom_nw[atom_type]; iw++) @@ -460,14 +482,14 @@ __global__ void phi_dot_dphi_kernel( const int* __restrict__ atom_nw, double* force) { - __shared__ double s_data[32 * 3]; // the length of s_data equals the max warp num of a block times 3 + // NOTE: this kernel assumes blockDim.x == 32 (a single warp). If the launch + // configuration is ever changed, the reduce below needs a shared-memory stage. const int bgrid_id = blockIdx.y; const int atoms_num = atoms_num_info[bgrid_id].x; const int pre_atoms_num = atoms_num_info[bgrid_id].y; const int b_phi_len = bgrid_phi_len[bgrid_id]; const int tid = threadIdx.x; - const int warp_id = tid / 32; - const int lane_id = tid % 32; + const int lane_id = tid; // blockDim.x == 32 for (int atom_id = blockIdx.x; atom_id < atoms_num; atom_id += gridDim.x) { @@ -481,44 +503,23 @@ __global__ void phi_dot_dphi_kernel( for (int iw = tid; iw < nw; iw += blockDim.x) { int phi_idx = phi_start + iw; - f[0] += phi[phi_idx] * dphi_x[phi_idx]; - f[1] += phi[phi_idx] * dphi_y[phi_idx]; - f[2] += phi[phi_idx] * dphi_z[phi_idx]; + const double p = phi[phi_idx]; + f[0] += p * dphi_x[phi_idx]; + f[1] += p * dphi_y[phi_idx]; + f[2] += p * dphi_z[phi_idx]; } } - // reduce the force in each block - for (int i = 0; i < 3; i++) - { - f[i] = warpReduceSum(f[i]); - } + // single-warp reduce + f[0] = warpReduceSum(f[0]); + f[1] = warpReduceSum(f[1]); + f[2] = warpReduceSum(f[2]); if (lane_id == 0) { - for (int i = 0; i < 3; i++) - { - s_data[warp_id * 3 + i] = f[i]; - } - } - __syncthreads(); - - for (int i = 0; i < 3; i++) - { - f[i] = (tid < blockDim.x / 32) ? s_data[tid * 3 + i] : 0; - } - if (warp_id == 0) - { - for (int i = 0; i < 3; i++) - { - f[i] = warpReduceSum(f[i]); - } - } - if (tid == 0) - { - for (int i = 0; i < 3; i++) - { - atomicAdd(&force[iat * 3 + i], f[i] * 2); - } + atomicAdd(&force[iat * 3 + 0], f[0] * 2); + atomicAdd(&force[iat * 3 + 1], f[1] * 2); + atomicAdd(&force[iat * 3 + 2], f[2] * 2); } } } @@ -539,14 +540,14 @@ __global__ void phi_dot_dphi_r_kernel( const int* __restrict__ atom_nw, double* __restrict__ svl) { - __shared__ double s_data[32 * 6]; // the length of s_data equals the max warp num of a block times 6 + // NOTE: this kernel assumes blockDim.x == 32 (a single warp). If the launch + // configuration is ever changed, the reduce below needs a shared-memory stage. const int tid = threadIdx.x; const int bgrid_id = blockIdx.y; const int atoms_num = atoms_num_info[bgrid_id].x; const int pre_atoms_num = atoms_num_info[bgrid_id].y; const int b_phi_len = bgrid_phi_len[bgrid_id]; - const int warp_id = tid / 32; - const int lane_id = tid % 32; + const int lane_id = tid; // blockDim.x == 32 double stress[6]{0.0}; for (int mgrid_id = blockIdx.x; mgrid_id < mgrids_per_bgrid; mgrid_id += gridDim.x) @@ -564,44 +565,29 @@ __global__ void phi_dot_dphi_r_kernel( for (int iw = tid; iw < nw; iw += blockDim.x) { int phi_idx = phi_start + iw; - stress[0] += phi[phi_idx] * dphi_x[phi_idx] * coord.x; - stress[1] += phi[phi_idx] * dphi_x[phi_idx] * coord.y; - stress[2] += phi[phi_idx] * dphi_x[phi_idx] * coord.z; - stress[3] += phi[phi_idx] * dphi_y[phi_idx] * coord.y; - stress[4] += phi[phi_idx] * dphi_y[phi_idx] * coord.z; - stress[5] += phi[phi_idx] * dphi_z[phi_idx] * coord.z; + const double p = phi[phi_idx]; + const double pdx = p * dphi_x[phi_idx]; + const double pdy = p * dphi_y[phi_idx]; + const double pdz = p * dphi_z[phi_idx]; + stress[0] += pdx * coord.x; + stress[1] += pdx * coord.y; + stress[2] += pdx * coord.z; + stress[3] += pdy * coord.y; + stress[4] += pdy * coord.z; + stress[5] += pdz * coord.z; } } } - // reduce the stress in each block + // single-warp reduce + #pragma unroll for (int i = 0; i < 6; i++) { stress[i] = warpReduceSum(stress[i]); } - if (lane_id == 0) { - for (int i = 0; i < 6; i++) - { - s_data[warp_id * 6 + i] = stress[i]; - } - } - __syncthreads(); - - for (int i = 0; i < 6; i++) - { - stress[i] = (tid < blockDim.x / 32) ? s_data[tid * 6 + i] : 0; - } - if (warp_id == 0) - { - for (int i = 0; i < 6; i++) - { - stress[i] = warpReduceSum(stress[i]); - } - } - if (tid == 0) - { + #pragma unroll for (int i = 0; i < 6; i++) { atomicAdd(&svl[i], stress[i] * 2); diff --git a/source/source_lcao/module_gint/kernel/phi_operator_kernel.cuh b/source/source_lcao/module_gint/kernel/phi_operator_kernel.cuh index 9e51ae97c1..d670820a78 100644 --- a/source/source_lcao/module_gint/kernel/phi_operator_kernel.cuh +++ b/source/source_lcao/module_gint/kernel/phi_operator_kernel.cuh @@ -28,6 +28,8 @@ __global__ void set_phi_kernel( const int* __restrict__ bgrid_phi_len, Real* __restrict__ phi); +// WantPhi == false: skip phi[] writes entirely (callers like gint_tau pass nullptr). +template __global__ void set_phi_dphi_kernel( const int nwmax, const int mgrids_num,