From 35e3e2ae20542e5ab5cdcd26bdd9aee3c021a1c5 Mon Sep 17 00:00:00 2001 From: abacus_fixer Date: Fri, 15 May 2026 16:26:52 +0800 Subject: [PATCH 1/7] fix data race --- .../source_pw/module_pwdft/structure_factor.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/source/source_pw/module_pwdft/structure_factor.cpp b/source/source_pw/module_pwdft/structure_factor.cpp index c9dac45cc69..252b4b22968 100644 --- a/source/source_pw/module_pwdft/structure_factor.cpp +++ b/source/source_pw/module_pwdft/structure_factor.cpp @@ -85,18 +85,22 @@ void Structure_Factor::setup(const UnitCell* Ucell, const Parallel_Grid& pgrid, { for (int it=0; itntype; it++) { - const int na = Ucell->atoms[it].na; - const ModuleBase::Vector3 * const tau = Ucell->atoms[it].tau.data(); + const int na = Ucell->atoms[it].na; + const ModuleBase::Vector3 * const tau = Ucell->atoms[it].tau.data(); + // Data race fix: cache shared data to local const variables before OpenMP parallel region + // TSan detected race condition when accessing rho_basis->npw and rho_basis->gcar directly + // in parallel loop, even though they are logically read-only + const int npw = rho_basis->npw; + const ModuleBase::Vector3 * const gcar = rho_basis->gcar; #ifdef _OPENMP - #pragma omp parallel for + #pragma omp parallel for #endif - for (int ig=0; ignpw; ig++) + for (int ig=0; ig gcar_ig = rho_basis->gcar[ig]; + const ModuleBase::Vector3 gcar_ig = gcar[ig]; std::complex sum_phase = ModuleBase::ZERO; for (int ia=0; iastrucFac(it,ig) = sum_phase; From 1ff55167b5c3d0943f571427d1570612fe297d74 Mon Sep 17 00:00:00 2001 From: abacus_fixer Date: Fri, 15 May 2026 16:42:26 +0800 Subject: [PATCH 2/7] fix potential data race --- source/source_cell/check_atomic_stru.cpp | 41 +++++++++++++----------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/source/source_cell/check_atomic_stru.cpp b/source/source_cell/check_atomic_stru.cpp index c8fea41be48..ce032f81c45 100644 --- a/source/source_cell/check_atomic_stru.cpp +++ b/source/source_cell/check_atomic_stru.cpp @@ -9,10 +9,6 @@ namespace unitcell void check_atomic_stru(UnitCell& ucell, const double& factor) { ModuleBase::timer::start("unitcell", "check_atomic_stru"); - // First we calculate all bond length in the structure, - // and compare with the covalent_bond_length, - // if there has bond length is shorter than covalent_bond_length * factor, - // we think this structure is unreasonable. assert(ucell.ntype > 0); bool all_pass = true; bool no_warning = true; @@ -86,6 +82,15 @@ void check_atomic_stru(UnitCell& ucell, const double& factor) } const double bohr_to_a = ModuleBase::BOHR_TO_A; + // Data race fix: cache shared read-only data to local const references before OpenMP parallel region + // TSan detected race condition when accessing std::vector elements (cell, label) + // and other shared vectors (A, latvec, symbol_covalent_radiuss) in parallel loop. + // Using local const references ensures the compiler treats these as immutable within the parallel region + const std::vector& cell_ = cell; + const std::vector& label_ = label; + const std::vector& A_ = A; + const std::vector& lat_ = latvec; + const std::vector& rad_ = symbol_covalent_radiuss; #pragma omp parallel { std::vector delta_lat(3); @@ -94,13 +99,13 @@ void check_atomic_stru(UnitCell& ucell, const double& factor) { const int it1 = ucell.iat2it[iat]; const int ia1 = ucell.iat2ia[iat]; - const double symbol1_covalent_radius = symbol_covalent_radiuss[it1]; + const double symbol1_covalent_radius = rad_[it1]; double x1 = ucell.atoms[it1].taud[ia1].x; double y1 = ucell.atoms[it1].taud[ia1].y; double z1 = ucell.atoms[it1].taud[ia1].z; for (int it2 = it1; it2 < ntype; it2++) { - double symbol2_covalent_radius = symbol_covalent_radiuss[it2]; + double symbol2_covalent_radius = rad_[it2]; double covalent_length = (symbol1_covalent_radius + symbol2_covalent_radius) / bohr_to_a; const double max_error = covalent_length * max_factor_coef / ucell.lat0; const double max_error_2 = max_error * max_error; @@ -111,17 +116,17 @@ void check_atomic_stru(UnitCell& ucell, const double& factor) double delta_x = ucell.atoms[it2].taud[ia2].x - x1; double delta_y = ucell.atoms[it2].taud[ia2].y - y1; double delta_z = ucell.atoms[it2].taud[ia2].z - z1; - delta_lat[0] = delta_x * latvec[0] + delta_y * latvec[1] + delta_z * latvec[2]; - delta_lat[1] = delta_x * latvec[3] + delta_y * latvec[4] + delta_z * latvec[5]; - delta_lat[2] = delta_x * latvec[6] + delta_y * latvec[7] + delta_z * latvec[8]; + delta_lat[0] = delta_x * lat_[0] + delta_y * lat_[1] + delta_z * lat_[2]; + delta_lat[1] = delta_x * lat_[3] + delta_y * lat_[4] + delta_z * lat_[5]; + delta_lat[2] = delta_x * lat_[6] + delta_y * lat_[7] + delta_z * lat_[8]; for (int i = 0; i < 27; i++) { if ((is_same_atom) && (i == 13)) continue; const int offset = i * 3; - const double part1 = delta_lat[0] + A[offset]; - const double part2 = delta_lat[1] + A[offset + 1]; - const double part3 = delta_lat[2] + A[offset + 2]; + const double part1 = delta_lat[0] + A_[offset]; + const double part2 = delta_lat[1] + A_[offset + 1]; + const double part3 = delta_lat[2] + A_[offset + 2]; const double bond_length = part1 * part1 + part2 * part2 + part3 * part3; const bool flag = bond_length < max_error_2 ? true : false; if (flag) @@ -131,15 +136,15 @@ void check_atomic_stru(UnitCell& ucell, const double& factor) { no_warning = false; all_pass = all_pass && (sqrt_bon < factor_error ? false : true); - errorlog << std::setw(3) << ia1 + 1 << "-th " << label[it1] << ", " << std::setw(3) - << ia2 + 1 << "-th " << label[it2] << cell[i] << std::setprecision(3) + errorlog << std::setw(3) << ia1 + 1 << "-th " << label_[it1] << ", " << std::setw(3) + << ia2 + 1 << "-th " << label_[it2] << cell_[i] << std::setprecision(3) << sqrt_bon << " Bohr (" << sqrt_bon * bohr_to_a << " Angstrom)\n"; } } - } - } // ia2 - } // it2 - } // iat + } + } + } + } } } if (!all_pass || !no_warning) From a3b0622516e39856c643150d108032288a652ae9 Mon Sep 17 00:00:00 2001 From: abacus_fixer Date: Fri, 15 May 2026 17:03:18 +0800 Subject: [PATCH 3/7] fix data race in pw --- .../source_basis/module_pw/pw_gatherscatter.h | 164 ++++++++++-------- .../source_basis/module_pw/pw_transform.cpp | 83 +++++---- 2 files changed, 147 insertions(+), 100 deletions(-) diff --git a/source/source_basis/module_pw/pw_gatherscatter.h b/source/source_basis/module_pw/pw_gatherscatter.h index 24e3671d4b3..904666b7faa 100644 --- a/source/source_basis/module_pw/pw_gatherscatter.h +++ b/source/source_basis/module_pw/pw_gatherscatter.h @@ -1,7 +1,8 @@ #include "pw_basis.h" #include "source_base/global_function.h" #include "source_base/timer.h" -#include "typeinfo" +#include + namespace ModulePW { /** @@ -18,16 +19,18 @@ void PW_Basis::gatherp_scatters(std::complex* in, std::complex* out) const if(this->poolnproc == 1) //In this case nst=nstot, nz = nplane, { + const int nst = this->nst; + const int nz = this->nz; + const int* istot2ixy = this->istot2ixy; #ifdef _OPENMP #pragma omp parallel for #endif - for(int is = 0 ; is < this->nst ; ++is) + for(int is = 0 ; is < nst ; ++is) { - int ixy = this->istot2ixy[is]; - //int ixy = (ixy / fftny)*ny + ixy % fftny; + int ixy = istot2ixy[is]; std::complex *outp = &out[is*nz]; std::complex *inp = &in[ixy*nz]; - for(int iz = 0 ; iz < this->nz ; ++iz) + for(int iz = 0 ; iz < nz ; ++iz) { outp[iz] = inp[iz]; } @@ -38,51 +41,59 @@ void PW_Basis::gatherp_scatters(std::complex* in, std::complex* out) const #ifdef __MPI //change (nplane fftnxy) to (nplane,nstot) // Hence, we can send them at one time. + const int nstot_gps = this->nstot; + const int nplane_gps = this->nplane; + const int* istot2ixy_gps = this->istot2ixy; #ifdef _OPENMP #pragma omp parallel for #endif - for (int istot = 0;istot < nstot; ++istot) - { - int ixy = this->istot2ixy[istot]; - //int ixy = (ixy / fftny)*ny + ixy % fftny; - std::complex *outp = &out[istot*nplane]; - std::complex *inp = &in[ixy*nplane]; - for (int iz = 0; iz < nplane; ++iz) - { - outp[iz] = inp[iz]; - } - } + for (int istot = 0; istot < nstot_gps; ++istot) + { + int ixy = istot2ixy_gps[istot]; + std::complex *outp = &out[istot * nplane_gps]; + std::complex *inp = &in[ixy * nplane_gps]; + for (int iz = 0; iz < nplane_gps; ++iz) + { + outp[iz] = inp[iz]; + } + } //exchange data //(nplane,nstot) to (numz[ip],ns, poolnproc) - if(typeid(T) == typeid(double)) - { - MPI_Alltoallv(out, numr, startr, MPI_DOUBLE_COMPLEX, in, numg, startg, MPI_DOUBLE_COMPLEX, this->pool_world); - } - else if(typeid(T) == typeid(float)) - { - MPI_Alltoallv(out, numr, startr, MPI_COMPLEX, in, numg, startg, MPI_COMPLEX, this->pool_world); - } + if(typeid(T) == typeid(double)) + { + MPI_Alltoallv(out, numr, startr, MPI_DOUBLE_COMPLEX, in, numg, startg, MPI_DOUBLE_COMPLEX, this->pool_world); + } + else if(typeid(T) == typeid(float)) + { + MPI_Alltoallv(out, numr, startr, MPI_COMPLEX, in, numg, startg, MPI_COMPLEX, this->pool_world); + } // change (nz,ns) to (numz[ip],ns, poolnproc) + const int poolnproc_gps = this->poolnproc; + const int nst_gps = this->nst; + const int nz_gps = this->nz; + const int* numz_gps = this->numz; + const int* startg_gps = this->startg; + const int* startz_gps = this->startz; #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif - for (int ip = 0; ip < this->poolnproc ;++ip) - { - for (int is = 0; is < this->nst; ++is) - { - int nzip = this->numz[ip]; - std::complex *outp0 = &out[startz[ip]]; - std::complex *inp0 = &in[startg[ip]]; - std::complex *outp = &outp0[is * nz]; + for (int ip = 0; ip < poolnproc_gps ;++ip) + { + for (int is = 0; is < nst_gps; ++is) + { + int nzip = numz_gps[ip]; + std::complex *outp0 = &out[startz_gps[ip]]; + std::complex *inp0 = &in[startg_gps[ip]]; + std::complex *outp = &outp0[is * nz_gps]; std::complex *inp = &inp0[is * nzip ]; - for (int izip = 0; izip < nzip; ++izip) - { - outp[izip] = inp[izip]; - } - } - } + for (int izip = 0; izip < nzip; ++izip) + { + outp[izip] = inp[izip]; + } + } + } #endif //ModuleBase::timer::start(this->classname, "gatherp_scatters"); return; @@ -101,10 +112,14 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const // ModuleBase::timer::start(this->classname, "gathers_scatterp"); if(this->poolnproc == 1) //In this case nrxx=fftnx*fftny*nz, nst = nstot, { + const int nrxx = this->nrxx; + const int nst = this->nst; + const int nz = this->nz; + const int* istot2ixy = this->istot2ixy; #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for(int i = 0; i < this->nrxx; ++i) + for(int i = 0; i < nrxx; ++i) { out[i] = std::complex(0, 0); } @@ -112,13 +127,12 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const #ifdef _OPENMP #pragma omp parallel for #endif - for(int is = 0 ; is < this->nst ; ++is) + for(int is = 0 ; is < nst ; ++is) { int ixy = istot2ixy[is]; - //int ixy = (ixy / fftny)*ny + ixy % fftny; std::complex *outp = &out[ixy*nz]; std::complex *inp = &in[is*nz]; - for(int iz = 0 ; iz < this->nz ; ++iz) + for(int iz = 0 ; iz < nz ; ++iz) { outp[iz] = inp[iz]; } @@ -129,57 +143,67 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const #ifdef __MPI // change (nz,ns) to (numz[ip],ns, poolnproc) // Hence, we can send them at one time. + const int poolnproc = this->poolnproc; + const int nst = this->nst; + const int nz = this->nz; + const int* numz = this->numz; + const int* startg = this->startg; + const int* startz = this->startz; #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif - for (int ip = 0; ip < this->poolnproc ;++ip) - { - for (int is = 0; is < this->nst; ++is) - { - int nzip = this->numz[ip]; + for (int ip = 0; ip < poolnproc ;++ip) + { + for (int is = 0; is < nst; ++is) + { + int nzip = numz[ip]; std::complex *outp0 = &out[startg[ip]]; std::complex *inp0 = &in[startz[ip]]; std::complex *outp = &outp0[is * nzip]; std::complex *inp = &inp0[is * nz ]; - for (int izip = 0; izip < nzip; ++izip) - { - outp[izip] = inp[izip]; - } - } - } + for (int izip = 0; izip < nzip; ++izip) + { + outp[izip] = inp[izip]; + } + } + } - //exchange data + //exchange data //(numz[ip],ns, poolnproc) to (nplane,nstot) - if(typeid(T) == typeid(double)) - { - MPI_Alltoallv(out, numg, startg, MPI_DOUBLE_COMPLEX, in, numr, startr, MPI_DOUBLE_COMPLEX, this->pool_world); - } - else if(typeid(T) == typeid(float)) - { - MPI_Alltoallv(out, numg, startg, MPI_COMPLEX, in, numr, startr, MPI_COMPLEX, this->pool_world); - } + if(typeid(T) == typeid(double)) + { + MPI_Alltoallv(out, numg, startg, MPI_DOUBLE_COMPLEX, in, numr, startr, MPI_DOUBLE_COMPLEX, this->pool_world); + } + else if(typeid(T) == typeid(float)) + { + MPI_Alltoallv(out, numg, startg, MPI_COMPLEX, in, numr, startr, MPI_COMPLEX, this->pool_world); + } + const int nrxx_gsp = this->nrxx; #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for(int i = 0; i < this->nrxx; ++i) + for(int i = 0; i < nrxx_gsp; ++i) { out[i] = std::complex(0, 0); } //change (nplane,nstot) to (nplane fftnxy) + const int nstot = this->nstot; + const int nplane = this->nplane; + const int* istot2ixy = this->istot2ixy; #ifdef _OPENMP #pragma omp parallel for #endif - for (int istot = 0;istot < nstot; ++istot) - { - int ixy = this->istot2ixy[istot]; + for (int istot = 0;istot < nstot; ++istot) + { + int ixy = istot2ixy[istot]; //int ixy = (ixy / fftny)*ny + ixy % fftny; std::complex *outp = &out[ixy * nplane]; std::complex *inp = &in[istot * nplane]; - for (int iz = 0; iz < nplane; ++iz) - { - outp[iz] = inp[iz]; - } + for (int iz = 0; iz < nplane; ++iz) + { + outp[iz] = inp[iz]; + } } #endif // ModuleBase::timer::start(this->classname, "gathers_scatterp"); diff --git a/source/source_basis/module_pw/pw_transform.cpp b/source/source_basis/module_pw/pw_transform.cpp index 2195517d49d..d0980d33b81 100644 --- a/source/source_basis/module_pw/pw_transform.cpp +++ b/source/source_basis/module_pw/pw_transform.cpp @@ -30,10 +30,14 @@ void PW_Basis::real2recip(const std::complex* in, ModuleBase::timer::start(this->classname, "real2recip"); assert(this->gamma_only == false); + const int nrxx = this->nrxx; + const int npw = this->npw; + const int nxyz = this->nxyz; + const int* ig2isz = this->ig2isz; #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ir = 0; ir < this->nrxx; ++ir) + for (int ir = 0; ir < nrxx; ++ir) { this->fft_bundle.get_auxr_data()[ir] = in[ir]; } @@ -45,24 +49,24 @@ void PW_Basis::real2recip(const std::complex* in, if (add) { - FPTYPE tmpfac = factor / FPTYPE(this->nxyz); + FPTYPE tmpfac = factor / FPTYPE(nxyz); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ig = 0; ig < this->npw; ++ig) + for (int ig = 0; ig < npw; ++ig) { - out[ig] += tmpfac * this->fft_bundle.get_auxg_data()[this->ig2isz[ig]]; + out[ig] += tmpfac * this->fft_bundle.get_auxg_data()[ig2isz[ig]]; } } else { - FPTYPE tmpfac = 1.0 / FPTYPE(this->nxyz); + FPTYPE tmpfac = 1.0 / FPTYPE(nxyz); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ig = 0; ig < this->npw; ++ig) + for (int ig = 0; ig < npw; ++ig) { - out[ig] = tmpfac * this->fft_bundle.get_auxg_data()[this->ig2isz[ig]]; + out[ig] = tmpfac * this->fft_bundle.get_auxg_data()[ig2isz[ig]]; } } ModuleBase::timer::end(this->classname, "real2recip"); @@ -79,13 +83,20 @@ template void PW_Basis::real2recip(const FPTYPE* in, std::complex* out, const bool add, const FPTYPE factor) const { ModuleBase::timer::start(this->classname, "real2recip"); + const int nrxx = this->nrxx; + const int npw = this->npw; + const int nxyz = this->nxyz; + const int* ig2isz = this->ig2isz; + const int nx = this->nx; + const int ny = this->ny; + const int nplane = this->nplane; if (this->gamma_only) { - const int npy = this->ny * this->nplane; + const int npy = ny * nplane; #ifdef _OPENMP #pragma omp parallel for collapse(2) schedule(static) #endif - for (int ix = 0; ix < this->nx; ++ix) + for (int ix = 0; ix < nx; ++ix) { for (int ipy = 0; ipy < npy; ++ipy) { @@ -100,7 +111,7 @@ void PW_Basis::real2recip(const FPTYPE* in, std::complex* out, const boo #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ir = 0; ir < this->nrxx; ++ir) + for (int ir = 0; ir < nrxx; ++ir) { this->fft_bundle.get_auxr_data()[ir] = std::complex(in[ir], 0); } @@ -112,24 +123,24 @@ void PW_Basis::real2recip(const FPTYPE* in, std::complex* out, const boo if (add) { - FPTYPE tmpfac = factor / FPTYPE(this->nxyz); + FPTYPE tmpfac = factor / FPTYPE(nxyz); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ig = 0; ig < this->npw; ++ig) + for (int ig = 0; ig < npw; ++ig) { - out[ig] += tmpfac * this->fft_bundle.get_auxg_data()[this->ig2isz[ig]]; + out[ig] += tmpfac * this->fft_bundle.get_auxg_data()[ig2isz[ig]]; } } else { - FPTYPE tmpfac = 1.0 / FPTYPE(this->nxyz); + FPTYPE tmpfac = 1.0 / FPTYPE(nxyz); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ig = 0; ig < this->npw; ++ig) + for (int ig = 0; ig < npw; ++ig) { - out[ig] = tmpfac * this->fft_bundle.get_auxg_data()[this->ig2isz[ig]]; + out[ig] = tmpfac * this->fft_bundle.get_auxg_data()[ig2isz[ig]]; } } ModuleBase::timer::end(this->classname, "real2recip"); @@ -150,10 +161,15 @@ void PW_Basis::recip2real(const std::complex* in, { ModuleBase::timer::start(this->classname, "recip2real"); assert(this->gamma_only == false); + const int nst = this->nst; + const int nz = this->nz; + const int npw = this->npw; + const int nrxx = this->nrxx; + const int* ig2isz = this->ig2isz; #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int i = 0; i < this->nst * this->nz; ++i) + for (int i = 0; i < nst * nz; ++i) { fft_bundle.get_auxg_data()[i] = std::complex(0, 0); } @@ -161,9 +177,9 @@ void PW_Basis::recip2real(const std::complex* in, #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ig = 0; ig < this->npw; ++ig) + for (int ig = 0; ig < npw; ++ig) { - this->fft_bundle.get_auxg_data()[this->ig2isz[ig]] = in[ig]; + this->fft_bundle.get_auxg_data()[ig2isz[ig]] = in[ig]; } this->fft_bundle.fftzbac(fft_bundle.get_auxg_data(), fft_bundle.get_auxg_data()); @@ -176,7 +192,7 @@ void PW_Basis::recip2real(const std::complex* in, #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ir = 0; ir < this->nrxx; ++ir) + for (int ir = 0; ir < nrxx; ++ir) { out[ir] += factor * this->fft_bundle.get_auxr_data()[ir]; } @@ -186,7 +202,7 @@ void PW_Basis::recip2real(const std::complex* in, #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ir = 0; ir < this->nrxx; ++ir) + for (int ir = 0; ir < nrxx; ++ir) { out[ir] = this->fft_bundle.get_auxr_data()[ir]; } @@ -205,10 +221,18 @@ template void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const bool add, const FPTYPE factor) const { ModuleBase::timer::start(this->classname, "recip2real"); + const int nst = this->nst; + const int nz = this->nz; + const int npw = this->npw; + const int nrxx = this->nrxx; + const int nx = this->nx; + const int ny = this->ny; + const int nplane = this->nplane; + const int* ig2isz = this->ig2isz; #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int i = 0; i < this->nst * this->nz; ++i) + for (int i = 0; i < nst * nz; ++i) { fft_bundle.get_auxg_data()[i] = std::complex(0, 0); } @@ -216,9 +240,9 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ig = 0; ig < this->npw; ++ig) + for (int ig = 0; ig < npw; ++ig) { - this->fft_bundle.get_auxg_data()[this->ig2isz[ig]] = in[ig]; + this->fft_bundle.get_auxg_data()[ig2isz[ig]] = in[ig]; } this->fft_bundle.fftzbac(fft_bundle.get_auxg_data(), fft_bundle.get_auxg_data()); @@ -228,15 +252,14 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo { this->fft_bundle.fftxyc2r(fft_bundle.get_auxr_data(), fft_bundle.get_rspace_data()); - // r2c in place - const int npy = this->ny * this->nplane; + const int npy = ny * nplane; if (add) { #ifdef _OPENMP #pragma omp parallel for collapse(2) schedule(static) #endif - for (int ix = 0; ix < this->nx; ++ix) + for (int ix = 0; ix < nx; ++ix) { for (int ipy = 0; ipy < npy; ++ipy) { @@ -249,7 +272,7 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo #ifdef _OPENMP #pragma omp parallel for collapse(2) schedule(static) #endif - for (int ix = 0; ix < this->nx; ++ix) + for (int ix = 0; ix < nx; ++ix) { for (int ipy = 0; ipy < npy; ++ipy) { @@ -266,7 +289,7 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ir = 0; ir < this->nrxx; ++ir) + for (int ir = 0; ir < nrxx; ++ir) { out[ir] += factor * this->fft_bundle.get_auxr_data()[ir].real(); } @@ -276,7 +299,7 @@ void PW_Basis::recip2real(const std::complex* in, FPTYPE* out, const boo #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif - for (int ir = 0; ir < this->nrxx; ++ir) + for (int ir = 0; ir < nrxx; ++ir) { out[ir] = this->fft_bundle.get_auxr_data()[ir].real(); } From 5087b2d11852aa0dc52a5800adb59eb944f2492e Mon Sep 17 00:00:00 2001 From: abacus_fixer Date: Fri, 15 May 2026 17:12:39 +0800 Subject: [PATCH 4/7] fix potential data race --- source/source_estate/module_charge/charge.cpp | 108 ++++++++++-------- 1 file changed, 62 insertions(+), 46 deletions(-) diff --git a/source/source_estate/module_charge/charge.cpp b/source/source_estate/module_charge/charge.cpp index 62a9e702f90..94fa0fd4548 100644 --- a/source/source_estate/module_charge/charge.cpp +++ b/source/source_estate/module_charge/charge.cpp @@ -359,7 +359,9 @@ void Charge::atomic_rho(const int spin_number_need, gstart = 1; } if (PARAM.inp.test_charge > 0) + { std::cout << "\n |G|=0 term done." << std::endl; + } //---------------------------------------------------------- // Here we compute the G<>0 term // But if in parallel case @@ -370,14 +372,18 @@ void Charge::atomic_rho(const int spin_number_need, #pragma omp parallel { #endif - std::vector rho1d(ucell.meshx); + const int ngg = this->rhopw->ngg; + const double* gg_uniq = this->rhopw->gg_uniq; + const int meshx = ucell.meshx; + const double tpiba = ucell.tpiba; + std::vector rho1d(meshx); #ifdef _OPENMP #pragma omp for #endif - for (int igg = gstart; igg < this->rhopw->ngg; ++igg) + for (int igg = gstart; igg < ngg; ++igg) { - const double gx = sqrt(this->rhopw->gg_uniq[igg]) * ucell.tpiba; + const double gx = sqrt(gg_uniq[igg]) * tpiba; for (int ir = 0; ir < mesh; ir++) { if (atom->ncpp.r[ir] < 1.0e-8) @@ -397,7 +403,9 @@ void Charge::atomic_rho(const int spin_number_need, #endif { if (PARAM.inp.test_charge > 0) + { std::cout << " |G|>0 term done." << std::endl; + } } //---------------------------------------------------------- // EXPLAIN : Complete the transfer of rho from real space to @@ -406,7 +414,7 @@ void Charge::atomic_rho(const int spin_number_need, #ifdef _OPENMP #pragma omp for #endif - for (int igg = 0; igg < this->rhopw->ngg; igg++) + for (int igg = 0; igg < ngg; igg++) { rho_lgl[igg] /= omega; } @@ -420,12 +428,14 @@ void Charge::atomic_rho(const int spin_number_need, //---------------------------------------------------------- if (spin_number_need == 1) { + const int npw = this->rhopw->npw; + const int* ig2igg = this->rhopw->ig2igg; #ifdef _OPENMP #pragma omp parallel for #endif - for (int ig = 0; ig < this->rhopw->npw; ig++) + for (int ig = 0; ig < npw; ig++) { - rho_g3d(0, ig) += strucFac(it, ig) * rho_lgl[this->rhopw->ig2igg[ig]]; + rho_g3d(0, ig) += strucFac(it, ig) * rho_lgl[ig2igg[ig]]; } } // mohan add 2011-06-14, initialize the charge density according to each atom @@ -433,14 +443,18 @@ void Charge::atomic_rho(const int spin_number_need, { if (startmag_type == 1) { + const int npw = this->rhopw->npw; + const int* ig2igg = this->rhopw->ig2igg; + const double zv = atom->ncpp.zv; + const double start_mag_it = ucell.magnet.start_mag[it]; #ifdef _OPENMP #pragma omp parallel for #endif - for (int ig = 0; ig < this->rhopw->npw; ig++) + for (int ig = 0; ig < npw; ig++) { - const std::complex swap = strucFac(it, ig) * rho_lgl[this->rhopw->ig2igg[ig]]; - const double up = 0.5 * (1 + ucell.magnet.start_mag[it] / atom->ncpp.zv); - const double dw = 0.5 * (1 - ucell.magnet.start_mag[it] / atom->ncpp.zv); + const std::complex swap = strucFac(it, ig) * rho_lgl[ig2igg[ig]]; + const double up = 0.5 * (1 + start_mag_it / zv); + const double dw = 0.5 * (1 - start_mag_it / zv); rho_g3d(0, ig) += swap * up; rho_g3d(1, ig) += swap * dw; } @@ -449,25 +463,24 @@ void Charge::atomic_rho(const int spin_number_need, else if (startmag_type == 2) { std::complex ci_tpi = ModuleBase::NEG_IMAG_UNIT * ModuleBase::TWO_PI; + const int npw = this->rhopw->npw; + const ModuleBase::Vector3* gcar = this->rhopw->gcar; + const int* ig2igg = this->rhopw->ig2igg; + const double zv = atom->ncpp.zv; for (int ia = 0; ia < atom->na; ia++) { - // const double up = 0.5 * ( 1 + atom->mag[ia] ); - // const double dw = 0.5 * ( 1 - atom->mag[ia] ); const double up = 0.5 * (1 + atom->mag[ia] / atom->ncpp.zv); const double dw = 0.5 * (1 - atom->mag[ia] / atom->ncpp.zv); - // std::cout << " atom " << ia << " up=" << up << " dw=" << dw << std::endl; + const double tau_x = atom->tau[ia].x; + const double tau_y = atom->tau[ia].y; + const double tau_z = atom->tau[ia].z; #ifdef _OPENMP #pragma omp parallel for #endif - for (int ig = 0; ig < this->rhopw->npw; ig++) + for (int ig = 0; ig < npw; ig++) { - const double Gtau = this->rhopw->gcar[ig][0] * atom->tau[ia].x - + this->rhopw->gcar[ig][1] * atom->tau[ia].y - + this->rhopw->gcar[ig][2] * atom->tau[ia].z; - - std::complex swap - = ModuleBase::libm::exp(ci_tpi * Gtau) * rho_lgl[this->rhopw->ig2igg[ig]]; - + const double Gtau = gcar[ig][0] * tau_x + gcar[ig][1] * tau_y + gcar[ig][2] * tau_z; + std::complex swap = ModuleBase::libm::exp(ci_tpi * Gtau) * rho_lgl[ig2igg[ig]]; rho_g3d(0, ig) += swap * up; rho_g3d(1, ig) += swap * dw; } @@ -481,37 +494,42 @@ void Charge::atomic_rho(const int spin_number_need, { double sin_a1, sin_a2, cos_a1, cos_a2; if (PARAM.globalv.domag) - { // will not be used now, will be deleted later + { ModuleBase::libm::sincos(atom->angle1[0], &sin_a1, &cos_a1); ModuleBase::libm::sincos(atom->angle2[0], &sin_a2, &cos_a2); } + const int npw = this->rhopw->npw; + const int* ig2igg = this->rhopw->ig2igg; + const double zv = atom->ncpp.zv; + const double start_mag_it = ucell.magnet.start_mag[it]; #ifdef _OPENMP #pragma omp parallel for #endif - for (int ig = 0; ig < this->rhopw->npw; ig++) + for (int ig = 0; ig < npw; ig++) { - const std::complex swap = strucFac(it, ig) * rho_lgl[this->rhopw->ig2igg[ig]]; + const std::complex swap = strucFac(it, ig) * rho_lgl[ig2igg[ig]]; rho_g3d(0, ig) += swap; if (PARAM.globalv.domag) - { // will not be used now, will be deleted later - rho_g3d(1, ig) - += swap * (ucell.magnet.start_mag[it] / atom->ncpp.zv) * sin_a1 * cos_a2; - rho_g3d(2, ig) - += swap * (ucell.magnet.start_mag[it] / atom->ncpp.zv) * sin_a1 * sin_a2; - rho_g3d(3, ig) - += swap * (ucell.magnet.start_mag[it] / atom->ncpp.zv) * cos_a1; + { + rho_g3d(1, ig) += swap * (start_mag_it / zv) * sin_a1 * cos_a2; + rho_g3d(2, ig) += swap * (start_mag_it / zv) * sin_a1 * sin_a2; + rho_g3d(3, ig) += swap * (start_mag_it / zv) * cos_a1; } else if (PARAM.globalv.domag_z) { rho_g3d(1, ig) = 0.0; rho_g3d(2, ig) = 0.0; - rho_g3d(3, ig) += swap * (ucell.magnet.start_mag[it] / atom->ncpp.zv); + rho_g3d(3, ig) += swap * (start_mag_it / zv); } } } else if (startmag_type == 2) - { // zdy-warning-not-available + { std::complex ci_tpi = ModuleBase::NEG_IMAG_UNIT * ModuleBase::TWO_PI; + const int npw = this->rhopw->npw; + const ModuleBase::Vector3* gcar = this->rhopw->gcar; + const int* ig2igg = this->rhopw->ig2igg; + const double zv = atom->ncpp.zv; for (int ia = 0; ia < atom->na; ia++) { double sin_a1, sin_a2, cos_a1, cos_a2; @@ -523,29 +541,27 @@ void Charge::atomic_rho(const int spin_number_need, { ModuleBase::libm::sincos(atom->angle2[ia], &sin_a2, &cos_a2); } + const double mag_ia = atom->mag[ia]; + const double tau_x = atom->tau[ia].x; + const double tau_y = atom->tau[ia].y; + const double tau_z = atom->tau[ia].z; #ifdef _OPENMP #pragma omp parallel for #endif - for (int ig = 0; ig < this->rhopw->npw; ig++) + for (int ig = 0; ig < npw; ig++) { - const double Gtau = this->rhopw->gcar[ig][0] * atom->tau[ia].x - + this->rhopw->gcar[ig][1] * atom->tau[ia].y - + this->rhopw->gcar[ig][2] * atom->tau[ia].z; - - std::complex swap = exp(ci_tpi * Gtau) * rho_lgl[this->rhopw->ig2igg[ig]]; - - // calculate rho_total + const double Gtau = gcar[ig][0] * tau_x + gcar[ig][1] * tau_y + gcar[ig][2] * tau_z; + std::complex swap = exp(ci_tpi * Gtau) * rho_lgl[ig2igg[ig]]; + const double mag_factor = mag_ia / zv; rho_g3d(0, ig) += swap; - // calculate mag_z if (PARAM.globalv.domag || PARAM.globalv.domag_z) { - rho_g3d(3, ig) += swap * (atom->mag[ia] / atom->ncpp.zv) * cos_a1; + rho_g3d(3, ig) += swap * mag_factor * cos_a1; } - // calculate mag_x and mag_y if (PARAM.globalv.domag) { - rho_g3d(1, ig) += swap * (atom->mag[ia] / atom->ncpp.zv) * sin_a1 * cos_a2; - rho_g3d(2, ig) += swap * (atom->mag[ia] / atom->ncpp.zv) * sin_a1 * sin_a2; + rho_g3d(1, ig) += swap * mag_factor * sin_a1 * cos_a2; + rho_g3d(2, ig) += swap * mag_factor * sin_a1 * sin_a2; } else { From 1755bfbc577d1bdbb1b57c197406fa3dd2a5a199 Mon Sep 17 00:00:00 2001 From: abacus_fixer Date: Fri, 15 May 2026 17:16:26 +0800 Subject: [PATCH 5/7] update format --- source/source_base/math_integral.cpp | 44 ++++++++++++++-------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/source/source_base/math_integral.cpp b/source/source_base/math_integral.cpp index 0dd8434bc37..5b4e7cc125e 100644 --- a/source/source_base/math_integral.cpp +++ b/source/source_base/math_integral.cpp @@ -37,7 +37,7 @@ void Integral::Simpson_Integral // simpson's rule integrator for function stored on the // radial logarithmic mesh - // routine assumes that mesh is an odd number so run check + // routine assumes that mesh is an odd number so run check if (mesh % 2 == 0) { std::cout << "\n error in subroutine simpson "; @@ -83,21 +83,21 @@ void Integral::Simpson_Integral */ // simpson's rule integrator for function stored on the // radial logarithmic mesh - // routine assumes that mesh is an odd number so run check + // routine assumes that mesh is an odd number so run check assert(mesh&1); asum = 0.00; - const size_t end = mesh-2; + const size_t end = mesh-2; for( size_t i=1; i!=end; i+=2 ) { - const double f1 = func[i]*rab[i]; - asum += f1 + f1 + func[i+1]*rab[i+1]; + const double f1 = func[i]*rab[i]; + asum += f1 + f1 + func[i+1]*rab[i+1]; } - const double f1 = func[mesh-2]*rab[mesh-2]; - asum += f1+f1; - asum += asum; - asum += func[0]*rab[0] + func[mesh-1]*rab[mesh-1]; - asum /= 3.0; + const double f1 = func[mesh-2]*rab[mesh-2]; + asum += f1+f1; + asum += asum; + asum += func[0]*rab[0] + func[mesh-1]*rab[mesh-1]; + asum /= 3.0; return; }// end subroutine simpson @@ -124,21 +124,21 @@ void Integral::Simpson_Integral */ // simpson's rule integrator for function stored on the // radial logarithmic mesh - // routine assumes that mesh is an odd number so run check + // routine assumes that mesh is an odd number so run check assert(mesh&1); asum = 0.00; - const size_t end = mesh-2; + const size_t end = mesh-2; for(size_t i=1; i!=end; i+=2 ) { - const double f1 = func[i]; - asum += f1 + f1 + func[i+1]; + const double f1 = func[i]; + asum += f1 + f1 + func[i+1]; } - const double f1 = func[mesh-2]; - asum += f1+f1; - asum += asum; - asum += func[0] + func[mesh-1]; - asum *= dr/3.0; + const double f1 = func[mesh-2]; + asum += f1+f1; + asum += asum; + asum += func[0] + func[mesh-1]; + asum *= dr/3.0; return; }// end subroutine simpson @@ -220,10 +220,10 @@ void Integral::Simpson_Integral_alltoinf const double asum_all = asum[mesh-1]; for (int i = 0;i < mesh; ++i) - { + { asum[i] = asum_all - asum[i]; - } - return; + } + return; } double Integral::simpson(const int n, const double* const f, const double dx) From 23d40e3dd8f4812fe5a6db86e79b7efe8cf8ff29 Mon Sep 17 00:00:00 2001 From: abacus_fixer Date: Fri, 15 May 2026 17:28:41 +0800 Subject: [PATCH 6/7] fix data race in fft --- source/source_base/module_fft/fft_cpu.cpp | 114 +++++++++++++++------- 1 file changed, 80 insertions(+), 34 deletions(-) diff --git a/source/source_base/module_fft/fft_cpu.cpp b/source/source_base/module_fft/fft_cpu.cpp index f50f6e9e868..c486c31642a 100644 --- a/source/source_base/module_fft/fft_cpu.cpp +++ b/source/source_base/module_fft/fft_cpu.cpp @@ -344,60 +344,86 @@ void FFT_CPU::clear() template <> void FFT_CPU::fftxyfor(std::complex* in, std::complex* out) const { - int npy = this->nplane * this->ny; + const int npy = this->nplane * this->ny; + const int nx = this->nx; + const int lixy = this->lixy; + const int rixy = this->rixy; + const int nplane = this->nplane; + const fftw_plan planyfor = this->planyfor; + const fftw_plan planxfor1 = this->planxfor1; + const fftw_plan planxfor2 = this->planxfor2; if (this->xprime) { - fftw_execute_dft(this->planxfor1, (fftw_complex*)in, (fftw_complex*)out); + fftw_execute_dft(planxfor1, (fftw_complex*)in, (fftw_complex*)out); +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->lixy + 1; ++i) +#endif + for (int i = 0; i < lixy + 1; ++i) { - fftw_execute_dft(this->planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft(planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); } +#ifdef _OPENMP #pragma omp parallel for - for (int i = rixy; i < this->nx; ++i) +#endif + for (int i = rixy; i < nx; ++i) { - fftw_execute_dft(this->planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft(planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); } } else { +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->nx; ++i) +#endif + for (int i = 0; i < nx; ++i) { - fftw_execute_dft(this->planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft(planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); } - fftw_execute_dft(this->planxfor1, (fftw_complex*)in, (fftw_complex*)out); - fftw_execute_dft(this->planxfor2, (fftw_complex*)&in[rixy * nplane], (fftw_complex*)&out[rixy * nplane]); + fftw_execute_dft(planxfor1, (fftw_complex*)in, (fftw_complex*)out); + fftw_execute_dft(planxfor2, (fftw_complex*)&in[rixy * nplane], (fftw_complex*)&out[rixy * nplane]); } } template <> void FFT_CPU::fftxybac(std::complex* in,std::complex* out) const { - int npy = this->nplane * this->ny; + const int npy = this->nplane * this->ny; + const int nx = this->nx; + const int lixy = this->lixy; + const int rixy = this->rixy; + const int nplane = this->nplane; + const fftw_plan planybac = this->planybac; + const fftw_plan planxbac1 = this->planxbac1; + const fftw_plan planxbac2 = this->planxbac2; if (this->xprime) { +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->lixy + 1; ++i) +#endif + for (int i = 0; i < lixy + 1; ++i) { - fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft(planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); } +#ifdef _OPENMP #pragma omp parallel for - for (int i = rixy; i < this->nx; ++i) +#endif + for (int i = rixy; i < nx; ++i) { - fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft(planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); } - fftw_execute_dft(this->planxbac1, (fftw_complex*)in, (fftw_complex*)out); + fftw_execute_dft(planxbac1, (fftw_complex*)in, (fftw_complex*)out); } else { - fftw_execute_dft(this->planxbac1, (fftw_complex*)in, (fftw_complex*)out); - fftw_execute_dft(this->planxbac2, (fftw_complex*)&in[rixy * nplane], (fftw_complex*)&out[rixy * nplane]); + fftw_execute_dft(planxbac1, (fftw_complex*)in, (fftw_complex*)out); + fftw_execute_dft(planxbac2, (fftw_complex*)&in[rixy * nplane], (fftw_complex*)&out[rixy * nplane]); +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->nx; ++i) +#endif + for (int i = 0; i < nx; ++i) { - fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft(planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]); } } } @@ -417,47 +443,67 @@ void FFT_CPU::fftzbac(std::complex* in, std::complex* ou template <> void FFT_CPU::fftxyr2c(double* in, std::complex* out) const { - int npy = this->nplane * this->ny; + const int npy = this->nplane * this->ny; + const int nx = this->nx; + const int lixy = this->lixy; + const fftw_plan planxr2c = this->planxr2c; + const fftw_plan planyfor = this->planyfor; + const fftw_plan planyr2c = this->planyr2c; + const fftw_plan planxfor1 = this->planxfor1; if (this->xprime) { - fftw_execute_dft_r2c(this->planxr2c, in, (fftw_complex*)out); + fftw_execute_dft_r2c(planxr2c, in, (fftw_complex*)out); +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->lixy + 1; ++i) +#endif + for (int i = 0; i < lixy + 1; ++i) { - fftw_execute_dft(this->planyfor, (fftw_complex*)&out[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft(planyfor, (fftw_complex*)&out[i * npy], (fftw_complex*)&out[i * npy]); } } else { +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->nx; ++i) +#endif + for (int i = 0; i < nx; ++i) { - fftw_execute_dft_r2c(this->planyr2c, &in[i * npy], (fftw_complex*)&out[i * npy]); + fftw_execute_dft_r2c(planyr2c, &in[i * npy], (fftw_complex*)&out[i * npy]); } - fftw_execute_dft(this->planxfor1, (fftw_complex*)out, (fftw_complex*)out); + fftw_execute_dft(planxfor1, (fftw_complex*)out, (fftw_complex*)out); } } template <> void FFT_CPU::fftxyc2r(std::complex *in,double *out) const { - int npy = this->nplane * this->ny; + const int npy = this->nplane * this->ny; + const int nx = this->nx; + const int lixy = this->lixy; + const fftw_plan planybac = this->planybac; + const fftw_plan planxc2r = this->planxc2r; + const fftw_plan planxbac1 = this->planxbac1; + const fftw_plan planyc2r = this->planyc2r; if (this->xprime) { +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->lixy + 1; ++i) +#endif + for (int i = 0; i < lixy + 1; ++i) { - fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&in[i * npy]); + fftw_execute_dft(planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&in[i * npy]); } - fftw_execute_dft_c2r(this->planxc2r, (fftw_complex*)in, out); + fftw_execute_dft_c2r(planxc2r, (fftw_complex*)in, out); } else { - fftw_execute_dft(this->planxbac1, (fftw_complex*)in, (fftw_complex*)in); + fftw_execute_dft(planxbac1, (fftw_complex*)in, (fftw_complex*)in); +#ifdef _OPENMP #pragma omp parallel for - for (int i = 0; i < this->nx; ++i) +#endif + for (int i = 0; i < nx; ++i) { - fftw_execute_dft_c2r(this->planyc2r, (fftw_complex*)&in[i * npy], &out[i * npy]); + fftw_execute_dft_c2r(planyc2r, (fftw_complex*)&in[i * npy], &out[i * npy]); } } } From 76948b97cc63d1d2ec12fa8d1246c802dd8f6628 Mon Sep 17 00:00:00 2001 From: abacus_fixer Date: Sat, 16 May 2026 22:06:56 +0800 Subject: [PATCH 7/7] fix --- source/source_basis/module_pw/pw_gatherscatter.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/source/source_basis/module_pw/pw_gatherscatter.h b/source/source_basis/module_pw/pw_gatherscatter.h index 904666b7faa..ae42f15d624 100644 --- a/source/source_basis/module_pw/pw_gatherscatter.h +++ b/source/source_basis/module_pw/pw_gatherscatter.h @@ -60,6 +60,12 @@ void PW_Basis::gatherp_scatters(std::complex* in, std::complex* out) const //exchange data //(nplane,nstot) to (numz[ip],ns, poolnproc) + // OMP barrier: Ensure all OMP threads have finished writing to buffers + // before MPI communication reads from them. This prevents data race + // between OMP parallel write (e.g., FFT calculation) and MPI read. +#ifdef _OPENMP +#pragma omp barrier +#endif if(typeid(T) == typeid(double)) { MPI_Alltoallv(out, numr, startr, MPI_DOUBLE_COMPLEX, in, numg, startg, MPI_DOUBLE_COMPLEX, this->pool_world); @@ -170,6 +176,12 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const //exchange data //(numz[ip],ns, poolnproc) to (nplane,nstot) + // OMP barrier: Ensure all OMP threads have finished writing to buffers + // before MPI communication reads from them. This prevents data race + // between OMP parallel write (e.g., FFT calculation) and MPI read. +#ifdef _OPENMP +#pragma omp barrier +#endif if(typeid(T) == typeid(double)) { MPI_Alltoallv(out, numg, startg, MPI_DOUBLE_COMPLEX, in, numr, startr, MPI_DOUBLE_COMPLEX, this->pool_world);