diff --git a/source/source_base/math_integral.cpp b/source/source_base/math_integral.cpp index 0dd8434bc3..5b4e7cc125 100644 --- a/source/source_base/math_integral.cpp +++ b/source/source_base/math_integral.cpp @@ -37,7 +37,7 @@ void Integral::Simpson_Integral // simpson's rule integrator for function stored on the // radial logarithmic mesh - // routine assumes that mesh is an odd number so run check + // routine assumes that mesh is an odd number so run check if (mesh % 2 == 0) { std::cout << "\n error in subroutine simpson "; @@ -83,21 +83,21 @@ void Integral::Simpson_Integral */ // simpson's rule integrator for function stored on the // radial logarithmic mesh - // routine assumes that mesh is an odd number so run check + // routine assumes that mesh is an odd number so run check assert(mesh&1); asum = 0.00; - const size_t end = mesh-2; + const size_t end = mesh-2; for( size_t i=1; i!=end; i+=2 ) { - const double f1 = func[i]*rab[i]; - asum += f1 + f1 + func[i+1]*rab[i+1]; + const double f1 = func[i]*rab[i]; + asum += f1 + f1 + func[i+1]*rab[i+1]; } - const double f1 = func[mesh-2]*rab[mesh-2]; - asum += f1+f1; - asum += asum; - asum += func[0]*rab[0] + func[mesh-1]*rab[mesh-1]; - asum /= 3.0; + const double f1 = func[mesh-2]*rab[mesh-2]; + asum += f1+f1; + asum += asum; + asum += func[0]*rab[0] + func[mesh-1]*rab[mesh-1]; + asum /= 3.0; return; }// end subroutine simpson @@ -124,21 +124,21 @@ void Integral::Simpson_Integral */ // simpson's rule integrator for function stored on the // radial logarithmic mesh - // routine assumes that mesh is an odd number so run check + // routine assumes that mesh is an odd number so run check assert(mesh&1); asum = 0.00; - const size_t end = mesh-2; + const size_t end = mesh-2; for(size_t i=1; i!=end; i+=2 ) { - const double f1 = func[i]; - asum += f1 + f1 + func[i+1]; + const double f1 = func[i]; + asum += f1 + f1 + func[i+1]; } - const double f1 = func[mesh-2]; - asum += f1+f1; - asum += asum; - asum += func[0] + func[mesh-1]; - asum *= dr/3.0; + const double f1 = func[mesh-2]; + asum += f1+f1; + asum += asum; + asum += func[0] + func[mesh-1]; + asum *= dr/3.0; return; }// end subroutine simpson @@ -220,10 +220,10 @@ void Integral::Simpson_Integral_alltoinf const double asum_all = asum[mesh-1]; for (int i = 0;i < mesh; ++i) - { + { asum[i] = asum_all - asum[i]; - } - return; + } + return; } double Integral::simpson(const int n, const double* const f, const double dx) diff --git a/source/source_base/module_fft/fft_cpu.cpp b/source/source_base/module_fft/fft_cpu.cpp index f50f6e9e86..e3e64d4775 100644 --- a/source/source_base/module_fft/fft_cpu.cpp +++ b/source/source_base/module_fft/fft_cpu.cpp @@ -344,60 +344,86 @@ void FFT_CPU::clear() template <> void FFT_CPU::fftxyfor(std::complex* in, std::complex* out) const { - int npy = this->nplane * this->ny; + const int npy_ = this->nplane * this->ny; + const int nx_ = this->nx; + const int lixy_ = this->lixy; + const int rixy_ = this->rixy; + const int nplane_ = this->nplane; + const fftw_plan planyfor_ = this->planyfor; + const fftw_plan planxfor1_ = this->planxfor1; + const fftw_plan planxfor2_ = this->planxfor2; if (this->xprime) { - fftw_execute_dft(this->planxfor1, (fftw_complex*)in, (fftw_complex*)out); + fftw_execute_dft(planxfor1_, (fftw_complex*)in, (fftw_complex*)out); +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->lixy + 1; ++i) +#endif + for (int i = 0; i < lixy_ + 1; ++i) { - fftw_execute_dft(this->planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft(planyfor_, (fftw_complex*)&in[i * npy_], (fftw_complex*)&out[i * npy_]); } +#ifdef _OPENMP #pragma omp parallel for - for (int i = rixy; i < this->nx; ++i) +#endif + for (int i = rixy_; i < nx_; ++i) { - fftw_execute_dft(this->planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft(planyfor_, (fftw_complex*)&in[i * npy_], (fftw_complex*)&out[i * npy_]); } } else { +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->nx; ++i) +#endif + for (int i = 0; i < nx_; ++i) { - fftw_execute_dft(this->planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft(planyfor_, (fftw_complex*)&in[i * npy_], (fftw_complex*)&out[i * npy_]); } - fftw_execute_dft(this->planxfor1, (fftw_complex*)in, (fftw_complex*)out); - fftw_execute_dft(this->planxfor2, (fftw_complex*)&in[rixy * nplane], (fftw_complex*)&out[rixy * nplane]); + fftw_execute_dft(planxfor1_, (fftw_complex*)in, (fftw_complex*)out); + fftw_execute_dft(planxfor2_, (fftw_complex*)&in[rixy_ * nplane_], (fftw_complex*)&out[rixy_ * nplane_]); } } template <> void FFT_CPU::fftxybac(std::complex* in,std::complex* out) const { - int npy = this->nplane * this->ny; + const int npy_ = this->nplane * this->ny; + const int nx_ = this->nx; + const int lixy_ = this->lixy; + const int rixy_ = this->rixy; + const int nplane_ = this->nplane; + const fftw_plan planybac_ = this->planybac; + const fftw_plan planxbac1_ = this->planxbac1; + const fftw_plan planxbac2_ = this->planxbac2; if (this->xprime) { +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->lixy + 1; ++i) +#endif + for (int i = 0; i < lixy_ + 1; ++i) { - fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft(planybac_, (fftw_complex*)&in[i * npy_], (fftw_complex*)&out[i * npy_]); } +#ifdef _OPENMP #pragma omp parallel for - for (int i = rixy; i < this->nx; ++i) +#endif + for (int i = rixy_; i < nx_; ++i) { - fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft(planybac_, (fftw_complex*)&in[i * npy_], (fftw_complex*)&out[i * npy_]); } - fftw_execute_dft(this->planxbac1, (fftw_complex*)in, (fftw_complex*)out); + fftw_execute_dft(planxbac1_, (fftw_complex*)in, (fftw_complex*)out); } else { - fftw_execute_dft(this->planxbac1, (fftw_complex*)in, (fftw_complex*)out); - fftw_execute_dft(this->planxbac2, (fftw_complex*)&in[rixy * nplane], (fftw_complex*)&out[rixy * nplane]); + fftw_execute_dft(planxbac1_, (fftw_complex*)in, (fftw_complex*)out); + fftw_execute_dft(planxbac2_, (fftw_complex*)&in[rixy_ * nplane_], (fftw_complex*)&out[rixy_ * nplane_]); +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->nx; ++i) +#endif + for (int i = 0; i < nx_; ++i) { - fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft(planybac_, (fftw_complex*)&in[i * npy_], (fftw_complex*)&out[i * npy_]); } } } @@ -417,47 +443,67 @@ void FFT_CPU::fftzbac(std::complex* in, std::complex* ou template <> void FFT_CPU::fftxyr2c(double* in, std::complex* out) const { - int npy = this->nplane * this->ny; + const int npy_ = this->nplane * this->ny; + const int nx_ = this->nx; + const int lixy_ = this->lixy; + const fftw_plan planxr2c_ = this->planxr2c; + const fftw_plan planyfor_ = this->planyfor; + const fftw_plan planyr2c_ = this->planyr2c; + const fftw_plan planxfor1_ = this->planxfor1; if (this->xprime) { - fftw_execute_dft_r2c(this->planxr2c, in, (fftw_complex*)out); + fftw_execute_dft_r2c(planxr2c_, in, (fftw_complex*)out); +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->lixy + 1; ++i) +#endif + for (int i = 0; i < lixy_ + 1; ++i) { - fftw_execute_dft(this->planyfor, (fftw_complex*)&out[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft(planyfor_, (fftw_complex*)&out[i * npy_], (fftw_complex*)&out[i * npy_]); } } else { +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->nx; ++i) +#endif + for (int i = 0; i < nx_; ++i) { - fftw_execute_dft_r2c(this->planyr2c, &in[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft_r2c(planyr2c_, &in[i * npy_], (fftw_complex*)&out[i * npy_]); } - fftw_execute_dft(this->planxfor1, (fftw_complex*)out, (fftw_complex*)out); + fftw_execute_dft(planxfor1_, (fftw_complex*)out, (fftw_complex*)out); } } template <> void FFT_CPU::fftxyc2r(std::complex *in,double *out) const { - int npy = this->nplane * this->ny; + const int npy_ = this->nplane * this->ny; + const int nx_ = this->nx; + const int lixy_ = this->lixy; + const fftw_plan planybac_ = this->planybac; + const fftw_plan planxc2r_ = this->planxc2r; + const fftw_plan planxbac1_ = this->planxbac1; + const fftw_plan planyc2r_ = this->planyc2r; if (this->xprime) { +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->lixy + 1; ++i) +#endif + for (int i = 0; i < lixy_ + 1; ++i) { - fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&in[i * npy]); + fftw_execute_dft(planybac_, (fftw_complex*)&in[i * npy_], (fftw_complex*)&in[i * npy_]); } - fftw_execute_dft_c2r(this->planxc2r, (fftw_complex*)in, out); + fftw_execute_dft_c2r(planxc2r_, (fftw_complex*)in, out); } else { - fftw_execute_dft(this->planxbac1, (fftw_complex*)in, (fftw_complex*)in); + fftw_execute_dft(planxbac1_, (fftw_complex*)in, (fftw_complex*)in); +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->nx; ++i) +#endif + for (int i = 0; i < nx_; ++i) { - fftw_execute_dft_c2r(this->planyc2r, (fftw_complex*)&in[i * npy], &out[i * npy]); + fftw_execute_dft_c2r(planyc2r_, (fftw_complex*)&in[i * npy_], &out[i * npy_]); } } } diff --git a/source/source_basis/module_pw/pw_gatherscatter.h b/source/source_basis/module_pw/pw_gatherscatter.h index 24e3671d4b..571ddc5186 100644 --- a/source/source_basis/module_pw/pw_gatherscatter.h +++ b/source/source_basis/module_pw/pw_gatherscatter.h @@ -1,7 +1,8 @@ #include "pw_basis.h" #include "source_base/global_function.h" #include "source_base/timer.h" -#include "typeinfo" +#include + namespace ModulePW { /** @@ -18,16 +19,18 @@ void PW_Basis::gatherp_scatters(std::complex* in, std::complex* out) const if(this->poolnproc == 1) //In this case nst=nstot, nz = nplane, { + const int nst_ = this->nst; + const int nz_ = this->nz; + const int* istot2ixy_ = this->istot2ixy; #ifdef _OPENMP #pragma omp parallel for #endif - for(int is = 0 ; is < this->nst ; ++is) + for(int is = 0 ; is < nst_ ; ++is) { - int ixy = this->istot2ixy[is]; - //int ixy = (ixy / fftny)*ny + ixy % fftny; - std::complex *outp = &out[is*nz]; - std::complex *inp = &in[ixy*nz]; - for(int iz = 0 ; iz < this->nz ; ++iz) + int ixy = istot2ixy_[is]; + std::complex *outp = &out[is*nz_]; + std::complex *inp = &in[ixy*nz_]; + for(int iz = 0 ; iz < nz_ ; ++iz) { outp[iz] = inp[iz]; } @@ -38,51 +41,65 @@ void PW_Basis::gatherp_scatters(std::complex* in, std::complex* out) const #ifdef __MPI //change (nplane fftnxy) to (nplane,nstot) // Hence, we can send them at one time. + const int nstot_gps = this->nstot; + const int nplane_gps = this->nplane; + const int* istot2ixy_gps = this->istot2ixy; #ifdef _OPENMP -#pragma omp parallel for + #pragma omp parallel + { + #pragma omp for +#endif + for (int istot = 0; istot < nstot_gps; ++istot) + { + int ixy = istot2ixy_gps[istot]; + std::complex *outp = &out[istot * nplane_gps]; + std::complex *inp = &in[ixy * nplane_gps]; + for (int iz = 0; iz < nplane_gps; ++iz) + { + outp[iz] = inp[iz]; + } + } +#ifdef _OPENMP + #pragma omp barrier + } #endif - for (int istot = 0;istot < nstot; ++istot) - { - int ixy = this->istot2ixy[istot]; - //int ixy = (ixy / fftny)*ny + ixy % fftny; - std::complex *outp = &out[istot*nplane]; - std::complex *inp = &in[ixy*nplane]; - for (int iz = 0; iz < nplane; ++iz) - { - outp[iz] = inp[iz]; - } - } //exchange data //(nplane,nstot) to (numz[ip],ns, poolnproc) - if(typeid(T) == typeid(double)) - { - MPI_Alltoallv(out, numr, startr, MPI_DOUBLE_COMPLEX, in, numg, startg, MPI_DOUBLE_COMPLEX, this->pool_world); - } - else if(typeid(T) == typeid(float)) - { - MPI_Alltoallv(out, numr, startr, MPI_COMPLEX, in, numg, startg, MPI_COMPLEX, this->pool_world); - } + if(typeid(T) == typeid(double)) + { + MPI_Alltoallv(out, numr, startr, MPI_DOUBLE_COMPLEX, in, numg, startg, MPI_DOUBLE_COMPLEX, this->pool_world); + } + else if(typeid(T) == typeid(float)) + { + MPI_Alltoallv(out, numr, startr, MPI_COMPLEX, in, numg, startg, MPI_COMPLEX, this->pool_world); + } // change (nz,ns) to (numz[ip],ns, poolnproc) + const int poolnproc_gps = this->poolnproc; + const int nst_gps = this->nst; + const int nz_gps = this->nz; + const int* numz_gps = this->numz; + const int* startg_gps = this->startg; + const int* startz_gps = this->startz; #ifdef _OPENMP -#pragma omp parallel for collapse(2) + #pragma omp parallel for collapse(2) #endif - for (int ip = 0; ip < this->poolnproc ;++ip) - { - for (int is = 0; is < this->nst; ++is) - { - int nzip = this->numz[ip]; - std::complex *outp0 = &out[startz[ip]]; - std::complex *inp0 = &in[startg[ip]]; - std::complex *outp = &outp0[is * nz]; + for (int ip = 0; ip < poolnproc_gps ;++ip) + { + for (int is = 0; is < nst_gps; ++is) + { + int nzip = numz_gps[ip]; + std::complex *outp0 = &out[startz_gps[ip]]; + std::complex *inp0 = &in[startg_gps[ip]]; + std::complex *outp = &outp0[is * nz_gps]; std::complex *inp = &inp0[is * nzip ]; - for (int izip = 0; izip < nzip; ++izip) - { - outp[izip] = inp[izip]; - } - } - } + for (int izip = 0; izip < nzip; ++izip) + { + outp[izip] = inp[izip]; + } + } + } #endif //ModuleBase::timer::start(this->classname, "gatherp_scatters"); return; @@ -101,10 +118,14 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const // ModuleBase::timer::start(this->classname, "gathers_scatterp"); if(this->poolnproc == 1) //In this case nrxx=fftnx*fftny*nz, nst = nstot, { + const int nrxx_ = this->nrxx; + const int nst_ = this->nst; + const int nz_ = this->nz; + const int* istot2ixy_ = this->istot2ixy; #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for(int i = 0; i < this->nrxx; ++i) + for(int i = 0; i < nrxx_; ++i) { out[i] = std::complex(0, 0); } @@ -112,13 +133,12 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const #ifdef _OPENMP #pragma omp parallel for #endif - for(int is = 0 ; is < this->nst ; ++is) + for(int is = 0 ; is < nst_ ; ++is) { - int ixy = istot2ixy[is]; - //int ixy = (ixy / fftny)*ny + ixy % fftny; - std::complex *outp = &out[ixy*nz]; - std::complex *inp = &in[is*nz]; - for(int iz = 0 ; iz < this->nz ; ++iz) + int ixy = istot2ixy_[is]; + std::complex *outp = &out[ixy*nz_]; + std::complex *inp = &in[is*nz_]; + for(int iz = 0 ; iz < nz_ ; ++iz) { outp[iz] = inp[iz]; } @@ -129,57 +149,73 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const #ifdef __MPI // change (nz,ns) to (numz[ip],ns, poolnproc) // Hence, we can send them at one time. + const int poolnproc_ = this->poolnproc; + const int nst_ = this->nst; + const int nz_ = this->nz; + const int* numz_ = this->numz; + const int* startg_ = this->startg; + const int* startz_ = this->startz; +#ifdef _OPENMP + #pragma omp parallel + { + #pragma omp for collapse(2) +#endif + for (int ip = 0; ip < poolnproc_ ;++ip) + { + for (int is = 0; is < nst_; ++is) + { + int nzip = numz_[ip]; + std::complex *outp0 = &out[startg_[ip]]; + std::complex *inp0 = &in[startz_[ip]]; + std::complex *outp = &outp0[is * nzip]; + std::complex *inp = &inp0[is * nz_ ]; + for (int izip = 0; izip < nzip; ++izip) + { + outp[izip] = inp[izip]; + } + } + } #ifdef _OPENMP -#pragma omp parallel for collapse(2) + #pragma omp barrier + } #endif - for (int ip = 0; ip < this->poolnproc ;++ip) - { - for (int is = 0; is < this->nst; ++is) - { - int nzip = this->numz[ip]; - std::complex *outp0 = &out[startg[ip]]; - std::complex *inp0 = &in[startz[ip]]; - std::complex *outp = &outp0[is * nzip]; - std::complex *inp = &inp0[is * nz ]; - for (int izip = 0; izip < nzip; ++izip) - { - outp[izip] = inp[izip]; - } - } - } - //exchange data + //exchange data //(numz[ip],ns, poolnproc) to (nplane,nstot) - if(typeid(T) == typeid(double)) - { - MPI_Alltoallv(out, numg, startg, MPI_DOUBLE_COMPLEX, in, numr, startr, MPI_DOUBLE_COMPLEX, this->pool_world); - } - else if(typeid(T) == typeid(float)) - { - MPI_Alltoallv(out, numg, startg, MPI_COMPLEX, in, numr, startr, MPI_COMPLEX, this->pool_world); - } + if(typeid(T) == typeid(double)) + { + MPI_Alltoallv(out, numg, startg, MPI_DOUBLE_COMPLEX, in, numr, startr, MPI_DOUBLE_COMPLEX, this->pool_world); + } + else if(typeid(T) == typeid(float)) + { + MPI_Alltoallv(out, numg, startg, MPI_COMPLEX, in, numr, startr, MPI_COMPLEX, this->pool_world); + } + const int nrxx_gsp = this->nrxx; #ifdef _OPENMP -#pragma omp parallel for schedule(static) + #pragma omp parallel for schedule(static) #endif - for(int i = 0; i < this->nrxx; ++i) + for(int i = 0; i < nrxx_gsp; ++i) { out[i] = std::complex(0, 0); } //change (nplane,nstot) to (nplane fftnxy) + const int nstot = this->nstot; + const int nplane = this->nplane; + const int* istot2ixy = this->istot2ixy; #ifdef _OPENMP #pragma omp parallel for #endif - for (int istot = 0;istot < nstot; ++istot) - { - int ixy = this->istot2ixy[istot]; + for (int istot = 0;istot < nstot; ++istot) + { + int ixy = istot2ixy[istot]; //int ixy = (ixy / fftny)*ny + ixy % fftny; std::complex *outp = &out[ixy * nplane]; std::complex *inp = &in[istot * nplane]; - for (int iz = 0; iz < nplane; ++iz) - { - outp[iz] = inp[iz]; - } + for (int iz = 0; iz < nplane; ++iz) + { + outp[iz] = inp[iz]; + } } #endif // ModuleBase::timer::start(this->classname, "gathers_scatterp"); diff --git a/source/source_basis/module_pw/pw_transform.cpp b/source/source_basis/module_pw/pw_transform.cpp index 2195517d49..220b353e9d 100644 --- a/source/source_basis/module_pw/pw_transform.cpp +++ b/source/source_basis/module_pw/pw_transform.cpp @@ -30,10 +30,14 @@ void PW_Basis::real2recip(const std::complex* in, ModuleBase::timer::start(this->classname, "real2recip"); assert(this->gamma_only == false); + const int nrxx_ = this->nrxx; + const int npw_ = this->npw; + const int nxyz_ = this->nxyz; + const int* ig2isz_ = this->ig2isz; #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ir = 0; ir < this->nrxx; ++ir) + for (int ir = 0; ir < nrxx_; ++ir) { this->fft_bundle.get_auxr_data()[ir] = in[ir]; } @@ -45,24 +49,24 @@ void PW_Basis::real2recip(const std::complex* in, if (add) { - FPTYPE tmpfac = factor / FPTYPE(this->nxyz); + FPTYPE tmpfac = factor / FPTYPE(nxyz_); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ig = 0; ig < this->npw; ++ig) + for (int ig = 0; ig < npw_; ++ig) { - out[ig] += tmpfac * this->fft_bundle.get_auxg_data()[this->ig2isz[ig]]; + out[ig] += tmpfac * this->fft_bundle.get_auxg_data()[ig2isz_[ig]]; } } else { - FPTYPE tmpfac = 1.0 / FPTYPE(this->nxyz); + FPTYPE tmpfac = 1.0 / FPTYPE(nxyz_); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ig = 0; ig < this->npw; ++ig) + for (int ig = 0; ig < npw_; ++ig) { - out[ig] = tmpfac * this->fft_bundle.get_auxg_data()[this->ig2isz[ig]]; + out[ig] = tmpfac * this->fft_bundle.get_auxg_data()[ig2isz_[ig]]; } } ModuleBase::timer::end(this->classname, "real2recip"); @@ -79,13 +83,20 @@ template void PW_Basis::real2recip(const FPTYPE* in, std::complex* out, const bool add, const FPTYPE factor) const { ModuleBase::timer::start(this->classname, "real2recip"); + const int nrxx_ = this->nrxx; + const int npw_ = this->npw; + const int nxyz_ = this->nxyz; + const int* ig2isz_ = this->ig2isz; + const int nx_ = this->nx; + const int ny_ = this->ny; + const int nplane_ = this->nplane; if (this->gamma_only) { - const int npy = this->ny * this->nplane; + const int npy = ny_ * nplane_; #ifdef _OPENMP #pragma omp parallel for collapse(2) schedule(static) #endif - for (int ix = 0; ix < this->nx; ++ix) + for (int ix = 0; ix < nx_; ++ix) { for (int ipy = 0; ipy < npy; ++ipy) { @@ -100,7 +111,7 @@ void PW_Basis::real2recip(const FPTYPE* in, std::complex* out, const boo #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ir = 0; ir < this->nrxx; ++ir) + for (int ir = 0; ir < nrxx_; ++ir) { this->fft_bundle.get_auxr_data()[ir] = std::complex(in[ir], 0); } @@ -112,24 +123,24 @@ void PW_Basis::real2recip(const FPTYPE* in, std::complex* out, const boo if (add) { - FPTYPE tmpfac = factor / FPTYPE(this->nxyz); + FPTYPE tmpfac = factor / FPTYPE(nxyz_); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ig = 0; ig < this->npw; ++ig) + for (int ig = 0; ig < npw_; ++ig) { - out[ig] += tmpfac * this->fft_bundle.get_auxg_data()[this->ig2isz[ig]]; + out[ig] += tmpfac * this->fft_bundle.get_auxg_data()[ig2isz_[ig]]; } } else { - FPTYPE tmpfac = 1.0 / FPTYPE(this->nxyz); + FPTYPE tmpfac = 1.0 / FPTYPE(nxyz_); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ig = 0; ig < this->npw; ++ig) + for (int ig = 0; ig < npw_; ++ig) { - out[ig] = tmpfac * this->fft_bundle.get_auxg_data()[this->ig2isz[ig]]; + out[ig] = tmpfac * this->fft_bundle.get_auxg_data()[ig2isz_[ig]]; } } ModuleBase::timer::end(this->classname, "real2recip"); @@ -150,10 +161,15 @@ void PW_Basis::recip2real(const std::complex* in, { ModuleBase::timer::start(this->classname, "recip2real"); assert(this->gamma_only == false); + const int nst_ = this->nst; + const int nz_ = this->nz; + const int npw_ = this->npw; + const int nrxx_ = this->nrxx; + const int* ig2isz_ = this->ig2isz; #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int i = 0; i < this->nst * this->nz; ++i) + for (int i = 0; i < nst_ * nz_; ++i) { fft_bundle.get_auxg_data()[i] = std::complex(0, 0); } @@ -161,22 +177,22 @@ void PW_Basis::recip2real(const std::complex* in, #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ig = 0; ig < this->npw; ++ig) + for (int ig = 0; ig < npw_; ++ig) { - this->fft_bundle.get_auxg_data()[this->ig2isz[ig]] = in[ig]; + this->fft_bundle.get_auxg_data()[ig2isz_[ig]] = in[ig]; } this->fft_bundle.fftzbac(fft_bundle.get_auxg_data(), fft_bundle.get_auxg_data()); this->gathers_scatterp(this->fft_bundle.get_auxg_data(), this->fft_bundle.get_auxr_data()); - this->fft_bundle.fftxybac(fft_bundle.get_auxr_data(), fft_bundle.get_auxr_data()); + this->fft_bundle.fftxybac(fft_bundle.get_auxr_data(), this->fft_bundle.get_auxr_data()); if (add) { #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ir = 0; ir < this->nrxx; ++ir) + for (int ir = 0; ir < nrxx_; ++ir) { out[ir] += factor * this->fft_bundle.get_auxr_data()[ir]; } @@ -186,7 +202,7 @@ void PW_Basis::recip2real(const std::complex* in, #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ir = 0; ir < this->nrxx; ++ir) + for (int ir = 0; ir < nrxx_; ++ir) { out[ir] = this->fft_bundle.get_auxr_data()[ir]; } @@ -205,10 +221,18 @@ template void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const bool add, const FPTYPE factor) const { ModuleBase::timer::start(this->classname, "recip2real"); + const int nst_ = this->nst; + const int nz_ = this->nz; + const int npw_ = this->npw; + const int nrxx_ = this->nrxx; + const int nx_ = this->nx; + const int ny_ = this->ny; + const int nplane_ = this->nplane; + const int* ig2isz_ = this->ig2isz; #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int i = 0; i < this->nst * this->nz; ++i) + for (int i = 0; i < nst_ * nz_; ++i) { fft_bundle.get_auxg_data()[i] = std::complex(0, 0); } @@ -216,9 +240,9 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ig = 0; ig < this->npw; ++ig) + for (int ig = 0; ig < npw_; ++ig) { - this->fft_bundle.get_auxg_data()[this->ig2isz[ig]] = in[ig]; + this->fft_bundle.get_auxg_data()[ig2isz_[ig]] = in[ig]; } this->fft_bundle.fftzbac(fft_bundle.get_auxg_data(), fft_bundle.get_auxg_data()); @@ -228,15 +252,14 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo { this->fft_bundle.fftxyc2r(fft_bundle.get_auxr_data(), fft_bundle.get_rspace_data()); - // r2c in place - const int npy = this->ny * this->nplane; + const int npy = ny_ * nplane_; if (add) { #ifdef _OPENMP #pragma omp parallel for collapse(2) schedule(static) #endif - for (int ix = 0; ix < this->nx; ++ix) + for (int ix = 0; ix < nx_; ++ix) { for (int ipy = 0; ipy < npy; ++ipy) { @@ -249,7 +272,7 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo #ifdef _OPENMP #pragma omp parallel for collapse(2) schedule(static) #endif - for (int ix = 0; ix < this->nx; ++ix) + for (int ix = 0; ix < nx_; ++ix) { for (int ipy = 0; ipy < npy; ++ipy) { @@ -266,7 +289,7 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ir = 0; ir < this->nrxx; ++ir) + for (int ir = 0; ir < nrxx_; ++ir) { out[ir] += factor * this->fft_bundle.get_auxr_data()[ir].real(); } @@ -276,7 +299,7 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ir = 0; ir < this->nrxx; ++ir) + for (int ir = 0; ir < nrxx_; ++ir) { out[ir] = this->fft_bundle.get_auxr_data()[ir].real(); } diff --git a/source/source_cell/check_atomic_stru.cpp b/source/source_cell/check_atomic_stru.cpp index c8fea41be4..ce032f81c4 100644 --- a/source/source_cell/check_atomic_stru.cpp +++ b/source/source_cell/check_atomic_stru.cpp @@ -9,10 +9,6 @@ namespace unitcell void check_atomic_stru(UnitCell& ucell, const double& factor) { ModuleBase::timer::start("unitcell", "check_atomic_stru"); - // First we calculate all bond length in the structure, - // and compare with the covalent_bond_length, - // if there has bond length is shorter than covalent_bond_length * factor, - // we think this structure is unreasonable. assert(ucell.ntype > 0); bool all_pass = true; bool no_warning = true; @@ -86,6 +82,15 @@ void check_atomic_stru(UnitCell& ucell, const double& factor) } const double bohr_to_a = ModuleBase::BOHR_TO_A; + // Data race fix: cache shared read-only data to local const references before OpenMP parallel region + // TSan detected race condition when accessing std::vector elements (cell, label) + // and other shared vectors (A, latvec, symbol_covalent_radiuss) in parallel loop. + // Using local const references ensures the compiler treats these as immutable within the parallel region + const std::vector& cell_ = cell; + const std::vector& label_ = label; + const std::vector& A_ = A; + const std::vector& lat_ = latvec; + const std::vector& rad_ = symbol_covalent_radiuss; #pragma omp parallel { std::vector delta_lat(3); @@ -94,13 +99,13 @@ void check_atomic_stru(UnitCell& ucell, const double& factor) { const int it1 = ucell.iat2it[iat]; const int ia1 = ucell.iat2ia[iat]; - const double symbol1_covalent_radius = symbol_covalent_radiuss[it1]; + const double symbol1_covalent_radius = rad_[it1]; double x1 = ucell.atoms[it1].taud[ia1].x; double y1 = ucell.atoms[it1].taud[ia1].y; double z1 = ucell.atoms[it1].taud[ia1].z; for (int it2 = it1; it2 < ntype; it2++) { - double symbol2_covalent_radius = symbol_covalent_radiuss[it2]; + double symbol2_covalent_radius = rad_[it2]; double covalent_length = (symbol1_covalent_radius + symbol2_covalent_radius) / bohr_to_a; const double max_error = covalent_length * max_factor_coef / ucell.lat0; const double max_error_2 = max_error * max_error; @@ -111,17 +116,17 @@ void check_atomic_stru(UnitCell& ucell, const double& factor) double delta_x = ucell.atoms[it2].taud[ia2].x - x1; double delta_y = ucell.atoms[it2].taud[ia2].y - y1; double delta_z = ucell.atoms[it2].taud[ia2].z - z1; - delta_lat[0] = delta_x * latvec[0] + delta_y * latvec[1] + delta_z * latvec[2]; - delta_lat[1] = delta_x * latvec[3] + delta_y * latvec[4] + delta_z * latvec[5]; - delta_lat[2] = delta_x * latvec[6] + delta_y * latvec[7] + delta_z * latvec[8]; + delta_lat[0] = delta_x * lat_[0] + delta_y * lat_[1] + delta_z * lat_[2]; + delta_lat[1] = delta_x * lat_[3] + delta_y * lat_[4] + delta_z * lat_[5]; + delta_lat[2] = delta_x * lat_[6] + delta_y * lat_[7] + delta_z * lat_[8]; for (int i = 0; i < 27; i++) { if ((is_same_atom) && (i == 13)) continue; const int offset = i * 3; - const double part1 = delta_lat[0] + A[offset]; - const double part2 = delta_lat[1] + A[offset + 1]; - const double part3 = delta_lat[2] + A[offset + 2]; + const double part1 = delta_lat[0] + A_[offset]; + const double part2 = delta_lat[1] + A_[offset + 1]; + const double part3 = delta_lat[2] + A_[offset + 2]; const double bond_length = part1 * part1 + part2 * part2 + part3 * part3; const bool flag = bond_length < max_error_2 ? true : false; if (flag) @@ -131,15 +136,15 @@ void check_atomic_stru(UnitCell& ucell, const double& factor) { no_warning = false; all_pass = all_pass && (sqrt_bon < factor_error ? false : true); - errorlog << std::setw(3) << ia1 + 1 << "-th " << label[it1] << ", " << std::setw(3) - << ia2 + 1 << "-th " << label[it2] << cell[i] << std::setprecision(3) + errorlog << std::setw(3) << ia1 + 1 << "-th " << label_[it1] << ", " << std::setw(3) + << ia2 + 1 << "-th " << label_[it2] << cell_[i] << std::setprecision(3) << sqrt_bon << " Bohr (" << sqrt_bon * bohr_to_a << " Angstrom)\n"; } } - } - } // ia2 - } // it2 - } // iat + } + } + } + } } } if (!all_pass || !no_warning) diff --git a/source/source_estate/module_charge/charge.cpp b/source/source_estate/module_charge/charge.cpp index 916220fd42..32d0b4f835 100644 --- a/source/source_estate/module_charge/charge.cpp +++ b/source/source_estate/module_charge/charge.cpp @@ -359,7 +359,9 @@ void Charge::atomic_rho(const int spin_number_need, gstart = 1; } if (PARAM.inp.test_charge > 0) + { std::cout << "\n |G|=0 term done." << std::endl; + } //---------------------------------------------------------- // Here we compute the G<>0 term // But if in parallel case @@ -370,14 +372,18 @@ void Charge::atomic_rho(const int spin_number_need, #pragma omp parallel { #endif - std::vector rho1d(ucell.meshx); + const int ngg = this->rhopw->ngg; + const double* gg_uniq = this->rhopw->gg_uniq; + const int meshx = ucell.meshx; + const double tpiba = ucell.tpiba; + std::vector rho1d(meshx); #ifdef _OPENMP #pragma omp for #endif - for (int igg = gstart; igg < this->rhopw->ngg; ++igg) + for (int igg = gstart; igg < ngg; ++igg) { - const double gx = sqrt(this->rhopw->gg_uniq[igg]) * ucell.tpiba; + const double gx = sqrt(gg_uniq[igg]) * tpiba; for (int ir = 0; ir < mesh; ir++) { if (atom->ncpp.r[ir] < 1.0e-8) @@ -397,7 +403,9 @@ void Charge::atomic_rho(const int spin_number_need, #endif { if (PARAM.inp.test_charge > 0) + { std::cout << " |G|>0 term done." << std::endl; + } } //---------------------------------------------------------- // EXPLAIN : Complete the transfer of rho from real space to @@ -406,7 +414,7 @@ void Charge::atomic_rho(const int spin_number_need, #ifdef _OPENMP #pragma omp for #endif - for (int igg = 0; igg < this->rhopw->ngg; igg++) + for (int igg = 0; igg < ngg; igg++) { rho_lgl[igg] /= omega; } @@ -420,12 +428,14 @@ void Charge::atomic_rho(const int spin_number_need, //---------------------------------------------------------- if (spin_number_need == 1) { + const int npw = this->rhopw->npw; + const int* ig2igg = this->rhopw->ig2igg; #ifdef _OPENMP #pragma omp parallel for #endif - for (int ig = 0; ig < this->rhopw->npw; ig++) + for (int ig = 0; ig < npw; ig++) { - rho_g3d(0, ig) += strucFac(it, ig) * rho_lgl[this->rhopw->ig2igg[ig]]; + rho_g3d(0, ig) += strucFac(it, ig) * rho_lgl[ig2igg[ig]]; } } // mohan add 2011-06-14, initialize the charge density according to each atom @@ -433,14 +443,18 @@ void Charge::atomic_rho(const int spin_number_need, { if (startmag_type == 1) { + const int npw = this->rhopw->npw; + const int* ig2igg = this->rhopw->ig2igg; + const double zv = atom->ncpp.zv; + const double start_mag_it = ucell.magnet.start_mag[it]; #ifdef _OPENMP #pragma omp parallel for #endif - for (int ig = 0; ig < this->rhopw->npw; ig++) + for (int ig = 0; ig < npw; ig++) { - const std::complex swap = strucFac(it, ig) * rho_lgl[this->rhopw->ig2igg[ig]]; - const double up = 0.5 * (1 + ucell.magnet.start_mag[it] / atom->ncpp.zv); - const double dw = 0.5 * (1 - ucell.magnet.start_mag[it] / atom->ncpp.zv); + const std::complex swap = strucFac(it, ig) * rho_lgl[ig2igg[ig]]; + const double up = 0.5 * (1 + start_mag_it / zv); + const double dw = 0.5 * (1 - start_mag_it / zv); rho_g3d(0, ig) += swap * up; rho_g3d(1, ig) += swap * dw; } @@ -449,25 +463,24 @@ void Charge::atomic_rho(const int spin_number_need, else if (startmag_type == 2) { std::complex ci_tpi = ModuleBase::NEG_IMAG_UNIT * ModuleBase::TWO_PI; + const int npw = this->rhopw->npw; + const ModuleBase::Vector3* gcar = this->rhopw->gcar; + const int* ig2igg = this->rhopw->ig2igg; + const double zv = atom->ncpp.zv; for (int ia = 0; ia < atom->na; ia++) { - // const double up = 0.5 * ( 1 + atom->mag[ia] ); - // const double dw = 0.5 * ( 1 - atom->mag[ia] ); const double up = 0.5 * (1 + atom->mag[ia] / atom->ncpp.zv); const double dw = 0.5 * (1 - atom->mag[ia] / atom->ncpp.zv); - // std::cout << " atom " << ia << " up=" << up << " dw=" << dw << std::endl; + const double tau_x = atom->tau[ia].x; + const double tau_y = atom->tau[ia].y; + const double tau_z = atom->tau[ia].z; #ifdef _OPENMP #pragma omp parallel for #endif - for (int ig = 0; ig < this->rhopw->npw; ig++) + for (int ig = 0; ig < npw; ig++) { - const double Gtau = this->rhopw->gcar[ig][0] * atom->tau[ia].x - + this->rhopw->gcar[ig][1] * atom->tau[ia].y - + this->rhopw->gcar[ig][2] * atom->tau[ia].z; - - std::complex swap - = ModuleBase::libm::exp(ci_tpi * Gtau) * rho_lgl[this->rhopw->ig2igg[ig]]; - + const double Gtau = gcar[ig][0] * tau_x + gcar[ig][1] * tau_y + gcar[ig][2] * tau_z; + std::complex swap = ModuleBase::libm::exp(ci_tpi * Gtau) * rho_lgl[ig2igg[ig]]; rho_g3d(0, ig) += swap * up; rho_g3d(1, ig) += swap * dw; } @@ -481,37 +494,42 @@ void Charge::atomic_rho(const int spin_number_need, { double sin_a1, sin_a2, cos_a1, cos_a2; if (PARAM.globalv.domag) - { // will not be used now, will be deleted later + { ModuleBase::libm::sincos(atom->angle1[0], &sin_a1, &cos_a1); ModuleBase::libm::sincos(atom->angle2[0], &sin_a2, &cos_a2); } + const int npw = this->rhopw->npw; + const int* ig2igg = this->rhopw->ig2igg; + const double zv = atom->ncpp.zv; + const double start_mag_it = ucell.magnet.start_mag[it]; #ifdef _OPENMP #pragma omp parallel for #endif - for (int ig = 0; ig < this->rhopw->npw; ig++) + for (int ig = 0; ig < npw; ig++) { - const std::complex swap = strucFac(it, ig) * rho_lgl[this->rhopw->ig2igg[ig]]; + const std::complex swap = strucFac(it, ig) * rho_lgl[ig2igg[ig]]; rho_g3d(0, ig) += swap; if (PARAM.globalv.domag) - { // will not be used now, will be deleted later - rho_g3d(1, ig) - += swap * (ucell.magnet.start_mag[it] / atom->ncpp.zv) * sin_a1 * cos_a2; - rho_g3d(2, ig) - += swap * (ucell.magnet.start_mag[it] / atom->ncpp.zv) * sin_a1 * sin_a2; - rho_g3d(3, ig) - += swap * (ucell.magnet.start_mag[it] / atom->ncpp.zv) * cos_a1; + { + rho_g3d(1, ig) += swap * (start_mag_it / zv) * sin_a1 * cos_a2; + rho_g3d(2, ig) += swap * (start_mag_it / zv) * sin_a1 * sin_a2; + rho_g3d(3, ig) += swap * (start_mag_it / zv) * cos_a1; } else if (PARAM.globalv.domag_z) { rho_g3d(1, ig) = 0.0; rho_g3d(2, ig) = 0.0; - rho_g3d(3, ig) += swap * (ucell.magnet.start_mag[it] / atom->ncpp.zv); + rho_g3d(3, ig) += swap * (start_mag_it / zv); } } } else if (startmag_type == 2) - { // zdy-warning-not-available + { std::complex ci_tpi = ModuleBase::NEG_IMAG_UNIT * ModuleBase::TWO_PI; + const int npw = this->rhopw->npw; + const ModuleBase::Vector3* gcar = this->rhopw->gcar; + const int* ig2igg = this->rhopw->ig2igg; + const double zv = atom->ncpp.zv; for (int ia = 0; ia < atom->na; ia++) { double sin_a1, sin_a2, cos_a1, cos_a2; @@ -523,29 +541,27 @@ void Charge::atomic_rho(const int spin_number_need, { ModuleBase::libm::sincos(atom->angle2[ia], &sin_a2, &cos_a2); } + const double mag_ia = atom->mag[ia]; + const double tau_x = atom->tau[ia].x; + const double tau_y = atom->tau[ia].y; + const double tau_z = atom->tau[ia].z; #ifdef _OPENMP #pragma omp parallel for #endif - for (int ig = 0; ig < this->rhopw->npw; ig++) + for (int ig = 0; ig < npw; ig++) { - const double Gtau = this->rhopw->gcar[ig][0] * atom->tau[ia].x - + this->rhopw->gcar[ig][1] * atom->tau[ia].y - + this->rhopw->gcar[ig][2] * atom->tau[ia].z; - - std::complex swap = exp(ci_tpi * Gtau) * rho_lgl[this->rhopw->ig2igg[ig]]; - - // calculate rho_total + const double Gtau = gcar[ig][0] * tau_x + gcar[ig][1] * tau_y + gcar[ig][2] * tau_z; + std::complex swap = exp(ci_tpi * Gtau) * rho_lgl[ig2igg[ig]]; + const double mag_factor = mag_ia / zv; rho_g3d(0, ig) += swap; - // calculate mag_z if (PARAM.globalv.domag || PARAM.globalv.domag_z) { - rho_g3d(3, ig) += swap * (atom->mag[ia] / atom->ncpp.zv) * cos_a1; + rho_g3d(3, ig) += swap * mag_factor * cos_a1; } - // calculate mag_x and mag_y if (PARAM.globalv.domag) { - rho_g3d(1, ig) += swap * (atom->mag[ia] / atom->ncpp.zv) * sin_a1 * cos_a2; - rho_g3d(2, ig) += swap * (atom->mag[ia] / atom->ncpp.zv) * sin_a1 * sin_a2; + rho_g3d(1, ig) += swap * mag_factor * sin_a1 * cos_a2; + rho_g3d(2, ig) += swap * mag_factor * sin_a1 * sin_a2; } else { diff --git a/source/source_pw/module_ofdft/evolve_ofdft.cpp b/source/source_pw/module_ofdft/evolve_ofdft.cpp index 51bdc6797e..06d54d13a2 100644 --- a/source/source_pw/module_ofdft/evolve_ofdft.cpp +++ b/source/source_pw/module_ofdft/evolve_ofdft.cpp @@ -5,6 +5,12 @@ #include "source_base/parallel_reduce.h" +// Data race fix: Cache shared member variables to local const variables before OpenMP parallel regions. +// ThreadSanitizer detected race conditions when accessing member variables through pointers +// in parallel loops, even though these values are logically read-only. By caching them +// to local const variables, the compiler can guarantee they won't change within the parallel region, +// eliminating false positive data race warnings from ThreadSanitizer. + void Evolve_OFDFT::cal_Hpsi(elecstate::ElecState* pelec, Charge& chr, UnitCell& ucell, @@ -12,15 +18,18 @@ void Evolve_OFDFT::cal_Hpsi(elecstate::ElecState* pelec, ModulePW::PW_Basis* pw_rho, std::vector>& Hpsi) { + const int nspin = PARAM.inp.nspin; + const int nrxx = pw_rho->nrxx; + // update rho #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif - for (int is = 0; is < PARAM.inp.nspin; ++is) + for (int is = 0; is < nspin; ++is) { - for (int ir = 0; ir < pw_rho->nrxx; ++ir) + for (int ir = 0; ir < nrxx; ++ir) { - chr.rho[is][ir] = abs(psi_[is * pw_rho->nrxx + ir])*abs(psi_[is * pw_rho->nrxx + ir]); + chr.rho[is][ir] = abs(psi_[is * nrxx + ir])*abs(psi_[is * nrxx + ir]); } } this->renormalize_psi(chr, pw_rho, psi_); @@ -35,12 +44,12 @@ void Evolve_OFDFT::cal_Hpsi(elecstate::ElecState* pelec, #ifdef _OPENMP #pragma omp parallel for #endif - for (int is = 0; is < PARAM.inp.nspin; ++is) + for (int is = 0; is < nspin; ++is) { const double* vr_eff = pelec->pot->get_eff_v(is); - for (int ir = 0; ir < pw_rho->nrxx; ++ir) + for (int ir = 0; ir < nrxx; ++ir) { - Hpsi[is * pw_rho->nrxx + ir] = vr_eff[ir]*psi_[is * pw_rho->nrxx + ir]; + Hpsi[is * nrxx + ir] = vr_eff[ir]*psi_[is * nrxx + ir]; } } this->cal_vw_potential_phi(psi_, pw_rho, Hpsi); @@ -50,16 +59,18 @@ void Evolve_OFDFT::renormalize_psi(Charge& chr, ModulePW::PW_Basis* pw_rho, std: { const double sr = chr.sum_rho(); const double normalize_factor = PARAM.inp.nelec / sr; + const int nspin = PARAM.inp.nspin; + const int nrxx = pw_rho->nrxx; std::cout<<"sr="<** rLapPhi = new std::complex*[PARAM.inp.nspin]; + std::complex** rLapPhi = new std::complex*[nspin]; #ifdef _OPENMP #pragma omp parallel for #endif - for (int is = 0; is < PARAM.inp.nspin; ++is) { - rLapPhi[is] = new std::complex[pw_rho->nrxx]; - for (int ir = 0; ir < pw_rho->nrxx; ++ir) + for (int is = 0; is < nspin; ++is) { + rLapPhi[is] = new std::complex[nrxx]; + for (int ir = 0; ir < nrxx; ++ir) { - rLapPhi[is][ir]=pphi[is * pw_rho->nrxx + ir]; + rLapPhi[is][ir]=pphi[is * nrxx + ir]; } } - std::complex** recipPhi = new std::complex*[PARAM.inp.nspin]; + std::complex** recipPhi = new std::complex*[nspin]; #ifdef _OPENMP #pragma omp parallel for #endif - for (int is = 0; is < PARAM.inp.nspin; ++is) + for (int is = 0; is < nspin; ++is) { - recipPhi[is] = new std::complex[pw_rho->npw]; + recipPhi[is] = new std::complex[npw]; pw_rho->real2recip(rLapPhi[is], recipPhi[is]); - for (int ik = 0; ik < pw_rho->npw; ++ik) + for (int ik = 0; ik < npw; ++ik) { - recipPhi[is][ik] *= pw_rho->gg[ik] * pw_rho->tpiba2; + recipPhi[is][ik] *= gg[ik] * tpiba2; } pw_rho->recip2real(recipPhi[is], rLapPhi[is]); - for (int ir = 0; ir < pw_rho->nrxx; ++ir) + for (int ir = 0; ir < nrxx; ++ir) { - Hpsi[is * pw_rho->nrxx + ir] += rLapPhi[is][ir]; + Hpsi[is * nrxx + ir] += rLapPhi[is][ir]; } } #ifdef _OPENMP #pragma omp parallel for #endif - for (int is = 0; is < PARAM.inp.nspin; ++is) + for (int is = 0; is < nspin; ++is) { delete[] recipPhi[is]; delete[] rLapPhi[is]; @@ -149,56 +170,62 @@ void Evolve_OFDFT::cal_CD_potential(std::vector>& psi_, ModuleBase::matrix& rpot, double mCD_para) { - std::complex imag(0.0,1.0); + const std::complex imag(0.0,1.0); + const int nspin = PARAM.inp.nspin; + const int nrxx = pw_rho->nrxx; + const int npw = pw_rho->npw; + const double tpiba = pw_rho->tpiba; + const double* gg = pw_rho->gg; + const ModuleBase::Vector3* gcar = pw_rho->gcar; - if (PARAM.inp.nspin <= 0) { + if (nspin <= 0) { ModuleBase::WARNING_QUIT("Evolve_OFDFT","nspin must be positive"); } - std::complex** recipPhi = new std::complex*[PARAM.inp.nspin]; - std::complex** rPhi = new std::complex*[PARAM.inp.nspin]; + std::complex** recipPhi = new std::complex*[nspin]; + std::complex** rPhi = new std::complex*[nspin]; #ifdef _OPENMP #pragma omp parallel for #endif - for (int is = 0; is < PARAM.inp.nspin; ++is) { - rPhi[is] = new std::complex[pw_rho->nrxx]; - for (int ir = 0; ir < pw_rho->nrxx; ++ir) + for (int is = 0; is < nspin; ++is) { + rPhi[is] = new std::complex[nrxx]; + for (int ir = 0; ir < nrxx; ++ir) { - rPhi[is][ir]=psi_[is * pw_rho->nrxx + ir]; + rPhi[is][ir]=psi_[is * nrxx + ir]; } } #ifdef _OPENMP #pragma omp parallel for #endif - for (int is = 0; is < PARAM.inp.nspin; ++is) + for (int is = 0; is < nspin; ++is) { - std::vector> recipCurrent_x(pw_rho->npw); - std::vector> recipCurrent_y(pw_rho->npw); - std::vector> recipCurrent_z(pw_rho->npw); - std::vector> recipCDPotential(pw_rho->npw); - std::vector> rCurrent_x(pw_rho->nrxx); - std::vector> rCurrent_y(pw_rho->nrxx); - std::vector> rCurrent_z(pw_rho->nrxx); - std::vector> kF_r(pw_rho->nrxx); - std::vector> rCDPotential(pw_rho->nrxx); - recipPhi[is] = new std::complex[pw_rho->npw]; - - for (int ir = 0; ir < pw_rho->nrxx; ++ir) + std::vector> recipCurrent_x(npw); + std::vector> recipCurrent_y(npw); + std::vector> recipCurrent_z(npw); + std::vector> recipCDPotential(npw); + std::vector> rCurrent_x(nrxx); + std::vector> rCurrent_y(nrxx); + std::vector> rCurrent_z(nrxx); + std::vector> kF_r(nrxx); + std::vector> rCDPotential(nrxx); + recipPhi[is] = new std::complex[npw]; + + for (int ir = 0; ir < nrxx; ++ir) { kF_r[ir]=std::pow(3*std::pow(ModuleBase::PI*std::abs(rPhi[is][ir]),2),1.0/3.0); } pw_rho->real2recip(rPhi[is], recipPhi[is]); - for (int ik = 0; ik < pw_rho->npw; ++ik) + for (int ik = 0; ik < npw; ++ik) { - recipCurrent_x[ik]=imag*pw_rho->gcar[ik].x*recipPhi[is][ik]* pw_rho->tpiba; - recipCurrent_y[ik]=imag*pw_rho->gcar[ik].y*recipPhi[is][ik]* pw_rho->tpiba; - recipCurrent_z[ik]=imag*pw_rho->gcar[ik].z*recipPhi[is][ik]* pw_rho->tpiba; + recipCurrent_x[ik]=imag*gcar[ik].x*recipPhi[is][ik]*tpiba; + recipCurrent_y[ik]=imag*gcar[ik].y*recipPhi[is][ik]*tpiba; + recipCurrent_z[ik]=imag*gcar[ik].z*recipPhi[is][ik]*tpiba; } pw_rho->recip2real(recipCurrent_x.data(),rCurrent_x.data()); pw_rho->recip2real(recipCurrent_y.data(),rCurrent_y.data()); pw_rho->recip2real(recipCurrent_z.data(),rCurrent_z.data()); - for (int ir = 0; ir < pw_rho->nrxx; ++ir) + for (int ir = 0; ir < nrxx; ++ir) { rCurrent_x[ir]=std::imag(rCurrent_x[ir]*std::conj(rPhi[is][ir])); rCurrent_y[ir]=std::imag(rCurrent_y[ir]*std::conj(rPhi[is][ir])); @@ -207,21 +234,21 @@ void Evolve_OFDFT::cal_CD_potential(std::vector>& psi_, pw_rho->real2recip(rCurrent_x.data(),recipCurrent_x.data()); pw_rho->real2recip(rCurrent_y.data(),recipCurrent_y.data()); pw_rho->real2recip(rCurrent_z.data(),recipCurrent_z.data()); - for (int ik = 0; ik < pw_rho->npw; ++ik) + for (int ik = 0; ik < npw; ++ik) { - recipCDPotential[ik]=recipCurrent_x[ik]*pw_rho->gcar[ik].x+recipCurrent_y[ik]*pw_rho->gcar[ik].y+recipCurrent_z[ik]*pw_rho->gcar[ik].z; - if (pw_rho->gg[ik]==0) + recipCDPotential[ik]=recipCurrent_x[ik]*gcar[ik].x+recipCurrent_y[ik]*gcar[ik].y+recipCurrent_z[ik]*gcar[ik].z; + if (gg[ik]==0) { recipCDPotential[ik]=0.0; } else { - recipCDPotential[ik]*=imag/sqrt(pw_rho->gg[ik]); + recipCDPotential[ik]*=imag/sqrt(gg[ik]); } } pw_rho->recip2real(recipCDPotential.data(),rCDPotential.data()); - for (int ir = 0; ir < pw_rho->nrxx; ++ir) + for (int ir = 0; ir < nrxx; ++ir) { rpot(0, ir) -= mCD_para*2.0*std::real(rCDPotential[ir])*std::pow(ModuleBase::PI,3) / (2.0*std::pow(std::real(kF_r[ir]),2)); @@ -235,7 +262,7 @@ void Evolve_OFDFT::cal_CD_potential(std::vector>& psi_, #ifdef _OPENMP #pragma omp parallel for #endif - for (int is = 0; is < PARAM.inp.nspin; ++is) + for (int is = 0; is < nspin; ++is) { delete[] recipPhi[is]; delete[] rPhi[is]; @@ -252,8 +279,8 @@ void Evolve_OFDFT::propagate_psi_RK4(elecstate::ElecState* pelec, { ModuleBase::timer::start("ESolver_OF_TDDFT", "propagate_psi_RK4"); - std::complex imag(0.0,1.0); - double dt=PARAM.inp.mdp.md_dt / ModuleBase::AU_to_FS; + const std::complex imag(0.0,1.0); + const double dt=PARAM.inp.mdp.md_dt / ModuleBase::AU_to_FS; const int nspin = PARAM.inp.nspin; const int nrxx = pw_rho->nrxx; const int total_size = nspin * nrxx; @@ -269,8 +296,8 @@ void Evolve_OFDFT::propagate_psi_RK4(elecstate::ElecState* pelec, #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif - for (int is = 0; is < PARAM.inp.nspin; ++is){ - for (int ir = 0; ir < pw_rho->nrxx; ++ir) + for (int is = 0; is < nspin; ++is){ + for (int ir = 0; ir < nrxx; ++ir) { K1[is * nrxx + ir]=-0.5*K1[is * nrxx + ir]*dt*imag; // 0.5 convert Ry to Hartree psi1[is * nrxx + ir]=pphi_[is * nrxx + ir]+0.5*K1[is * nrxx + ir]; @@ -280,8 +307,8 @@ void Evolve_OFDFT::propagate_psi_RK4(elecstate::ElecState* pelec, #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif - for (int is = 0; is < PARAM.inp.nspin; ++is){ - for (int ir = 0; ir < pw_rho->nrxx; ++ir) + for (int is = 0; is < nspin; ++is){ + for (int ir = 0; ir < nrxx; ++ir) { K2[is * nrxx + ir]=-0.5*K2[is * nrxx + ir]*dt*imag; psi2[is * nrxx + ir]=pphi_[is * nrxx + ir]+0.5*K2[is * nrxx + ir]; @@ -291,8 +318,8 @@ void Evolve_OFDFT::propagate_psi_RK4(elecstate::ElecState* pelec, #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif - for (int is = 0; is < PARAM.inp.nspin; ++is){ - for (int ir = 0; ir < pw_rho->nrxx; ++ir) + for (int is = 0; is < nspin; ++is){ + for (int ir = 0; ir < nrxx; ++ir) { K3[is * nrxx + ir]=-0.5*K3[is * nrxx + ir]*dt*imag; psi3[is * nrxx + ir]=pphi_[is * nrxx + ir]+K3[is * nrxx + ir]; @@ -302,8 +329,8 @@ void Evolve_OFDFT::propagate_psi_RK4(elecstate::ElecState* pelec, #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif - for (int is = 0; is < PARAM.inp.nspin; ++is){ - for (int ir = 0; ir < pw_rho->nrxx; ++ir) + for (int is = 0; is < nspin; ++is){ + for (int ir = 0; ir < nrxx; ++ir) { K4[is * nrxx + ir]=-0.5*K4[is * nrxx + ir]*dt*imag; pphi_[is * nrxx + ir]+=1.0/6.0*(K1[is * nrxx + ir]+2.0*K2[is * nrxx + ir]+2.0*K3[is * nrxx + ir]+K4[is * nrxx + ir]); @@ -313,11 +340,11 @@ void Evolve_OFDFT::propagate_psi_RK4(elecstate::ElecState* pelec, #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif - for (int is = 0; is < PARAM.inp.nspin; ++is) + for (int is = 0; is < nspin; ++is) { - for (int ir = 0; ir < pw_rho->nrxx; ++ir) + for (int ir = 0; ir < nrxx; ++ir) { - chr.rho[is][ir] = abs(pphi_[is * pw_rho->nrxx + ir])*abs(pphi_[is * pw_rho->nrxx + ir]); + chr.rho[is][ir] = abs(pphi_[is * nrxx + ir])*abs(pphi_[is * nrxx + ir]); } } this->renormalize_psi(chr, pw_rho, pphi_); @@ -334,7 +361,7 @@ void Evolve_OFDFT::propagate_psi_RK2(elecstate::ElecState* pelec, ModuleBase::timer::start("ESolver_OF_TDDFT", "propagate_psi_RK2"); const std::complex imag(0.0, 1.0); - double dt=PARAM.inp.mdp.md_dt / ModuleBase::AU_to_FS; + const double dt=PARAM.inp.mdp.md_dt / ModuleBase::AU_to_FS; const int nspin = PARAM.inp.nspin; const int nrxx = pw_rho->nrxx; const int total_size = nspin * nrxx; diff --git a/source/source_pw/module_pwdft/structure_factor.cpp b/source/source_pw/module_pwdft/structure_factor.cpp index 742f365951..6b342eaca8 100644 --- a/source/source_pw/module_pwdft/structure_factor.cpp +++ b/source/source_pw/module_pwdft/structure_factor.cpp @@ -85,18 +85,22 @@ void Structure_Factor::setup(const UnitCell* Ucell, const Parallel_Grid& pgrid, { for (int it=0; itntype; it++) { - const int na = Ucell->atoms[it].na; - const ModuleBase::Vector3 * const tau = Ucell->atoms[it].tau.data(); + const int na = Ucell->atoms[it].na; + const ModuleBase::Vector3 * const tau = Ucell->atoms[it].tau.data(); + // Data race fix: cache shared data to local const variables before OpenMP parallel region + // TSan detected race condition when accessing rho_basis->npw and rho_basis->gcar directly + // in parallel loop, even though they are logically read-only + const int npw = rho_basis->npw; + const ModuleBase::Vector3 * const gcar = rho_basis->gcar; #ifdef _OPENMP - #pragma omp parallel for + #pragma omp parallel for #endif - for (int ig=0; ignpw; ig++) + for (int ig=0; ig gcar_ig = rho_basis->gcar[ig]; + const ModuleBase::Vector3 gcar_ig = gcar[ig]; std::complex sum_phase = ModuleBase::ZERO; for (int ia=0; iastrucFac(it,ig) = sum_phase;