From 35e3e2ae20542e5ab5cdcd26bdd9aee3c021a1c5 Mon Sep 17 00:00:00 2001
From: abacus_fixer <mohanchen@pku.eud.cn>
Date: Fri, 15 May 2026 16:26:52 +0800
Subject: [PATCH 1/7] fix data race

---
 .../source_pw/module_pwdft/structure_factor.cpp  | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/source/source_pw/module_pwdft/structure_factor.cpp b/source/source_pw/module_pwdft/structure_factor.cpp
index c9dac45cc69..252b4b22968 100644
--- a/source/source_pw/module_pwdft/structure_factor.cpp
+++ b/source/source_pw/module_pwdft/structure_factor.cpp
@@ -85,18 +85,22 @@ void Structure_Factor::setup(const UnitCell* Ucell, const Parallel_Grid& pgrid,
     {
         for (int it=0; it<Ucell->ntype; it++)
         {
-	    	const int na = Ucell->atoms[it].na;
-	    	const ModuleBase::Vector3<double> * const tau = Ucell->atoms[it].tau.data();
+            const int na = Ucell->atoms[it].na;
+            const ModuleBase::Vector3<double> * const tau = Ucell->atoms[it].tau.data();
+            // Data race fix: cache shared data to local const variables before OpenMP parallel region
+            // TSan detected race condition when accessing rho_basis->npw and rho_basis->gcar directly
+            // in parallel loop, even though they are logically read-only
+            const int npw = rho_basis->npw;
+            const ModuleBase::Vector3<double> * const gcar = rho_basis->gcar;
 #ifdef _OPENMP
-		    #pragma omp parallel for
+            #pragma omp parallel for
 #endif
-            for (int ig=0; ig<rho_basis->npw; ig++)
+            for (int ig=0; ig<npw; ig++)
             {
-		    	const ModuleBase::Vector3<double> gcar_ig = rho_basis->gcar[ig];
+                const ModuleBase::Vector3<double> gcar_ig = gcar[ig];
                 std::complex<double> sum_phase = ModuleBase::ZERO;
                 for (int ia=0; ia<na; ia++)
                 {
-                    // e^{-i G*tau}
                     sum_phase += ModuleBase::libm::exp( ci_tpi * (gcar_ig * tau[ia]) );
                 }
                 this->strucFac(it,ig) = sum_phase;

From 1ff55167b5c3d0943f571427d1570612fe297d74 Mon Sep 17 00:00:00 2001
From: abacus_fixer <mohanchen@pku.eud.cn>
Date: Fri, 15 May 2026 16:42:26 +0800
Subject: [PATCH 2/7] fix potential data race

---
 source/source_cell/check_atomic_stru.cpp | 41 +++++++++++++-----------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/source/source_cell/check_atomic_stru.cpp b/source/source_cell/check_atomic_stru.cpp
index c8fea41be48..ce032f81c45 100644
--- a/source/source_cell/check_atomic_stru.cpp
+++ b/source/source_cell/check_atomic_stru.cpp
@@ -9,10 +9,6 @@ namespace unitcell
 void check_atomic_stru(UnitCell& ucell, const double& factor)
 {
     ModuleBase::timer::start("unitcell", "check_atomic_stru");
-    // First we calculate all bond length in the structure,
-    // and compare with the covalent_bond_length,
-    // if there has bond length is shorter than covalent_bond_length * factor,
-    // we think this structure is unreasonable.
     assert(ucell.ntype > 0);
     bool all_pass = true;
     bool no_warning = true;
@@ -86,6 +82,15 @@ void check_atomic_stru(UnitCell& ucell, const double& factor)
         }
 
         const double bohr_to_a = ModuleBase::BOHR_TO_A;
+        // Data race fix: cache shared read-only data to local const references before OpenMP parallel region
+        // TSan detected race condition when accessing std::vector<std::string> elements (cell, label)
+        // and other shared vectors (A, latvec, symbol_covalent_radiuss) in parallel loop.
+        // Using local const references ensures the compiler treats these as immutable within the parallel region
+        const std::vector<std::string>& cell_ = cell;
+        const std::vector<std::string>& label_ = label;
+        const std::vector<double>& A_ = A;
+        const std::vector<double>& lat_ = latvec;
+        const std::vector<double>& rad_ = symbol_covalent_radiuss;
 #pragma omp parallel
         {
             std::vector<double> delta_lat(3);
@@ -94,13 +99,13 @@ void check_atomic_stru(UnitCell& ucell, const double& factor)
             {
                 const int it1 = ucell.iat2it[iat];
                 const int ia1 = ucell.iat2ia[iat];
-                const double symbol1_covalent_radius = symbol_covalent_radiuss[it1];
+                const double symbol1_covalent_radius = rad_[it1];
                 double x1 = ucell.atoms[it1].taud[ia1].x;
                 double y1 = ucell.atoms[it1].taud[ia1].y;
                 double z1 = ucell.atoms[it1].taud[ia1].z;
                 for (int it2 = it1; it2 < ntype; it2++)
                 {
-                    double symbol2_covalent_radius = symbol_covalent_radiuss[it2];
+                    double symbol2_covalent_radius = rad_[it2];
                     double covalent_length = (symbol1_covalent_radius + symbol2_covalent_radius) / bohr_to_a;
                     const double max_error = covalent_length * max_factor_coef / ucell.lat0;
                     const double max_error_2 = max_error * max_error;
@@ -111,17 +116,17 @@ void check_atomic_stru(UnitCell& ucell, const double& factor)
                         double delta_x = ucell.atoms[it2].taud[ia2].x - x1;
                         double delta_y = ucell.atoms[it2].taud[ia2].y - y1;
                         double delta_z = ucell.atoms[it2].taud[ia2].z - z1;
-                        delta_lat[0] = delta_x * latvec[0] + delta_y * latvec[1] + delta_z * latvec[2];
-                        delta_lat[1] = delta_x * latvec[3] + delta_y * latvec[4] + delta_z * latvec[5];
-                        delta_lat[2] = delta_x * latvec[6] + delta_y * latvec[7] + delta_z * latvec[8];
+                        delta_lat[0] = delta_x * lat_[0] + delta_y * lat_[1] + delta_z * lat_[2];
+                        delta_lat[1] = delta_x * lat_[3] + delta_y * lat_[4] + delta_z * lat_[5];
+                        delta_lat[2] = delta_x * lat_[6] + delta_y * lat_[7] + delta_z * lat_[8];
                         for (int i = 0; i < 27; i++)
                         {
                             if ((is_same_atom) && (i == 13))
                                 continue;
                             const int offset = i * 3;
-                            const double part1 = delta_lat[0] + A[offset];
-                            const double part2 = delta_lat[1] + A[offset + 1];
-                            const double part3 = delta_lat[2] + A[offset + 2];
+                            const double part1 = delta_lat[0] + A_[offset];
+                            const double part2 = delta_lat[1] + A_[offset + 1];
+                            const double part3 = delta_lat[2] + A_[offset + 2];
                             const double bond_length = part1 * part1 + part2 * part2 + part3 * part3;
                             const bool flag = bond_length < max_error_2 ? true : false;
                             if (flag)
@@ -131,15 +136,15 @@ void check_atomic_stru(UnitCell& ucell, const double& factor)
                                 {
                                     no_warning = false;
                                     all_pass = all_pass && (sqrt_bon < factor_error ? false : true);
-                                    errorlog << std::setw(3) << ia1 + 1 << "-th " << label[it1] << ", " << std::setw(3)
-                                             << ia2 + 1 << "-th " << label[it2] << cell[i] << std::setprecision(3)
+                                    errorlog << std::setw(3) << ia1 + 1 << "-th " << label_[it1] << ", " << std::setw(3)
+                                             << ia2 + 1 << "-th " << label_[it2] << cell_[i] << std::setprecision(3)
                                              << sqrt_bon << " Bohr (" << sqrt_bon * bohr_to_a << " Angstrom)\n";
                                 }
                             }
-                        } 
-                    } // ia2
-                } // it2
-            } // iat
+                        }
+                    }
+                }
+            }
         }
     }
     if (!all_pass || !no_warning)

From a3b0622516e39856c643150d108032288a652ae9 Mon Sep 17 00:00:00 2001
From: abacus_fixer <mohanchen@pku.eud.cn>
Date: Fri, 15 May 2026 17:03:18 +0800
Subject: [PATCH 3/7] fix data race in pw

---
 .../source_basis/module_pw/pw_gatherscatter.h | 164 ++++++++++--------
 .../source_basis/module_pw/pw_transform.cpp   |  83 +++++----
 2 files changed, 147 insertions(+), 100 deletions(-)

diff --git a/source/source_basis/module_pw/pw_gatherscatter.h b/source/source_basis/module_pw/pw_gatherscatter.h
index 24e3671d4b3..904666b7faa 100644
--- a/source/source_basis/module_pw/pw_gatherscatter.h
+++ b/source/source_basis/module_pw/pw_gatherscatter.h
@@ -1,7 +1,8 @@
 #include "pw_basis.h"
 #include "source_base/global_function.h"
 #include "source_base/timer.h"
-#include "typeinfo"
+#include <typeinfo>
+
 namespace ModulePW
 {
 /**
@@ -18,16 +19,18 @@ void PW_Basis::gatherp_scatters(std::complex<T>* in, std::complex<T>* out) const
     
     if(this->poolnproc == 1) //In this case nst=nstot, nz = nplane, 
     {
+        const int nst = this->nst;
+        const int nz = this->nz;
+        const int* istot2ixy = this->istot2ixy;
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-        for(int is = 0 ; is < this->nst ; ++is)
+        for(int is = 0 ; is < nst ; ++is)
         {
-            int ixy = this->istot2ixy[is];
-            //int ixy = (ixy / fftny)*ny + ixy % fftny;
+            int ixy = istot2ixy[is];
             std::complex<T> *outp = &out[is*nz];
             std::complex<T> *inp = &in[ixy*nz];
-            for(int iz = 0 ; iz < this->nz ; ++iz)
+            for(int iz = 0 ; iz < nz ; ++iz)
             {
                 outp[iz] = inp[iz];
             }
@@ -38,51 +41,59 @@ void PW_Basis::gatherp_scatters(std::complex<T>* in, std::complex<T>* out) const
 #ifdef __MPI
     //change (nplane fftnxy) to (nplane,nstot)
     // Hence, we can send them at one time.
+    const int nstot_gps = this->nstot;
+    const int nplane_gps = this->nplane;
+    const int* istot2ixy_gps = this->istot2ixy;
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-	for (int istot = 0;istot < nstot; ++istot)
-	{
-		int ixy = this->istot2ixy[istot];
-        //int ixy = (ixy / fftny)*ny + ixy % fftny;
-        std::complex<T> *outp = &out[istot*nplane];
-        std::complex<T> *inp = &in[ixy*nplane];
-		for (int iz = 0; iz < nplane; ++iz)
-		{
-			outp[iz] = inp[iz];
-		}
-	}
+    for (int istot = 0; istot < nstot_gps; ++istot)
+    {
+        int ixy = istot2ixy_gps[istot];
+        std::complex<T> *outp = &out[istot * nplane_gps];
+        std::complex<T> *inp = &in[ixy * nplane_gps];
+        for (int iz = 0; iz < nplane_gps; ++iz)
+        {
+            outp[iz] = inp[iz];
+        }
+    }
 
     //exchange data
     //(nplane,nstot) to (numz[ip],ns, poolnproc)
-	if(typeid(T) == typeid(double))
-	{
-		MPI_Alltoallv(out, numr, startr, MPI_DOUBLE_COMPLEX, in, numg, startg, MPI_DOUBLE_COMPLEX, this->pool_world);
-	}
-	else if(typeid(T) == typeid(float))
-	{
-		MPI_Alltoallv(out, numr, startr, MPI_COMPLEX, in, numg, startg, MPI_COMPLEX, this->pool_world);
-	}
+    if(typeid(T) == typeid(double))
+    {
+        MPI_Alltoallv(out, numr, startr, MPI_DOUBLE_COMPLEX, in, numg, startg, MPI_DOUBLE_COMPLEX, this->pool_world);
+    }
+    else if(typeid(T) == typeid(float))
+    {
+        MPI_Alltoallv(out, numr, startr, MPI_COMPLEX, in, numg, startg, MPI_COMPLEX, this->pool_world);
+    }
 
     // change (nz,ns) to (numz[ip],ns, poolnproc)
+    const int poolnproc_gps = this->poolnproc;
+    const int nst_gps = this->nst;
+    const int nz_gps = this->nz;
+    const int* numz_gps = this->numz;
+    const int* startg_gps = this->startg;
+    const int* startz_gps = this->startz;
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2)
 #endif
-    for (int ip = 0; ip < this->poolnproc ;++ip)
-	{
-		for (int is = 0; is < this->nst; ++is)
-		{
-            int nzip = this->numz[ip];
-            std::complex<T> *outp0 = &out[startz[ip]];
-            std::complex<T> *inp0 = &in[startg[ip]];
-            std::complex<T> *outp = &outp0[is * nz];
+    for (int ip = 0; ip < poolnproc_gps ;++ip)
+    {
+        for (int is = 0; is < nst_gps; ++is)
+        {
+            int nzip = numz_gps[ip];
+            std::complex<T> *outp0 = &out[startz_gps[ip]];
+            std::complex<T> *inp0 = &in[startg_gps[ip]];
+            std::complex<T> *outp = &outp0[is * nz_gps];
             std::complex<T> *inp = &inp0[is * nzip ];
-			for (int izip = 0; izip < nzip; ++izip)
-			{
-				outp[izip] = inp[izip];
-			}
-		}
-	}
+            for (int izip = 0; izip < nzip; ++izip)
+            {
+                outp[izip] = inp[izip];
+            }
+        }
+    }
 #endif
     //ModuleBase::timer::start(this->classname, "gatherp_scatters");
     return;
@@ -101,10 +112,14 @@ void PW_Basis::gathers_scatterp(std::complex<T>* in, std::complex<T>* out) const
     // ModuleBase::timer::start(this->classname, "gathers_scatterp");
     if(this->poolnproc == 1) //In this case nrxx=fftnx*fftny*nz, nst = nstot, 
     {
+        const int nrxx = this->nrxx;
+        const int nst = this->nst;
+        const int nz = this->nz;
+        const int* istot2ixy = this->istot2ixy;
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-        for(int i = 0; i < this->nrxx; ++i)
+        for(int i = 0; i < nrxx; ++i)
         {
             out[i] = std::complex<T>(0, 0);
         }
@@ -112,13 +127,12 @@ void PW_Basis::gathers_scatterp(std::complex<T>* in, std::complex<T>* out) const
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-        for(int is = 0 ; is < this->nst ; ++is)
+        for(int is = 0 ; is < nst ; ++is)
         {
             int ixy = istot2ixy[is];
-            //int ixy = (ixy / fftny)*ny + ixy % fftny;
             std::complex<T> *outp = &out[ixy*nz];
             std::complex<T> *inp = &in[is*nz];
-            for(int iz = 0 ; iz < this->nz ; ++iz)
+            for(int iz = 0 ; iz < nz ; ++iz)
             {
                 outp[iz] = inp[iz];
             }
@@ -129,57 +143,67 @@ void PW_Basis::gathers_scatterp(std::complex<T>* in, std::complex<T>* out) const
 #ifdef __MPI
     // change (nz,ns) to (numz[ip],ns, poolnproc)
     // Hence, we can send them at one time. 
+    const int poolnproc = this->poolnproc;
+    const int nst = this->nst;
+    const int nz = this->nz;
+    const int* numz = this->numz;
+    const int* startg = this->startg;
+    const int* startz = this->startz;
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2)
 #endif
-    for (int ip = 0; ip < this->poolnproc ;++ip)
-	{
-		for (int is = 0; is < this->nst; ++is)
-		{
-            int nzip = this->numz[ip];
+    for (int ip = 0; ip < poolnproc ;++ip)
+    {
+        for (int is = 0; is < nst; ++is)
+        {
+            int nzip = numz[ip];
             std::complex<T> *outp0 = &out[startg[ip]];
             std::complex<T> *inp0 = &in[startz[ip]];
             std::complex<T> *outp = &outp0[is * nzip];
             std::complex<T> *inp = &inp0[is * nz ];
-			for (int izip = 0; izip < nzip; ++izip)
-			{
-				outp[izip] = inp[izip];
-			}
-		}
-	}
+            for (int izip = 0; izip < nzip; ++izip)
+            {
+                outp[izip] = inp[izip];
+            }
+        }
+    }
 
-	//exchange data
+    //exchange data
     //(numz[ip],ns, poolnproc) to (nplane,nstot)
-	if(typeid(T) == typeid(double))
-	{
-		MPI_Alltoallv(out, numg, startg, MPI_DOUBLE_COMPLEX, in, numr, startr, MPI_DOUBLE_COMPLEX, this->pool_world);
-	}
-	else if(typeid(T) == typeid(float))
-	{
-		MPI_Alltoallv(out, numg, startg, MPI_COMPLEX, in, numr, startr, MPI_COMPLEX, this->pool_world);
-	}
+    if(typeid(T) == typeid(double))
+    {
+        MPI_Alltoallv(out, numg, startg, MPI_DOUBLE_COMPLEX, in, numr, startr, MPI_DOUBLE_COMPLEX, this->pool_world);
+    }
+    else if(typeid(T) == typeid(float))
+    {
+        MPI_Alltoallv(out, numg, startg, MPI_COMPLEX, in, numr, startr, MPI_COMPLEX, this->pool_world);
+    }
 
+    const int nrxx_gsp = this->nrxx;
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-    for(int i = 0; i < this->nrxx; ++i)
+    for(int i = 0; i < nrxx_gsp; ++i)
     {
         out[i] = std::complex<T>(0, 0);
     }
     //change (nplane,nstot) to (nplane fftnxy)
+    const int nstot = this->nstot;
+    const int nplane = this->nplane;
+    const int* istot2ixy = this->istot2ixy;
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
-	for (int istot = 0;istot < nstot; ++istot)
-	{
-		int ixy = this->istot2ixy[istot];
+    for (int istot = 0;istot < nstot; ++istot)
+    {
+        int ixy = istot2ixy[istot];
         //int ixy = (ixy / fftny)*ny + ixy % fftny;
         std::complex<T> *outp = &out[ixy * nplane];
         std::complex<T> *inp = &in[istot * nplane];
-		for (int iz = 0; iz < nplane; ++iz)
-		{
-			outp[iz] = inp[iz];
-		}
+        for (int iz = 0; iz < nplane; ++iz)
+        {
+            outp[iz] = inp[iz];
+        }
     }
 #endif
     // ModuleBase::timer::start(this->classname, "gathers_scatterp");
diff --git a/source/source_basis/module_pw/pw_transform.cpp b/source/source_basis/module_pw/pw_transform.cpp
index 2195517d49d..d0980d33b81 100644
--- a/source/source_basis/module_pw/pw_transform.cpp
+++ b/source/source_basis/module_pw/pw_transform.cpp
@@ -30,10 +30,14 @@ void PW_Basis::real2recip(const std::complex<FPTYPE>* in,
     ModuleBase::timer::start(this->classname, "real2recip");
 
     assert(this->gamma_only == false);
+    const int nrxx = this->nrxx;
+    const int npw = this->npw;
+    const int nxyz = this->nxyz;
+    const int* ig2isz = this->ig2isz;
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-    for (int ir = 0; ir < this->nrxx; ++ir)
+    for (int ir = 0; ir < nrxx; ++ir)
     {
         this->fft_bundle.get_auxr_data<FPTYPE>()[ir] = in[ir];
     }
@@ -45,24 +49,24 @@ void PW_Basis::real2recip(const std::complex<FPTYPE>* in,
 
     if (add)
     {
-        FPTYPE tmpfac = factor / FPTYPE(this->nxyz);
+        FPTYPE tmpfac = factor / FPTYPE(nxyz);
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-        for (int ig = 0; ig < this->npw; ++ig)
+        for (int ig = 0; ig < npw; ++ig)
         {
-            out[ig] += tmpfac * this->fft_bundle.get_auxg_data<FPTYPE>()[this->ig2isz[ig]];
+            out[ig] += tmpfac * this->fft_bundle.get_auxg_data<FPTYPE>()[ig2isz[ig]];
         }
     }
     else
     {
-        FPTYPE tmpfac = 1.0 / FPTYPE(this->nxyz);
+        FPTYPE tmpfac = 1.0 / FPTYPE(nxyz);
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-        for (int ig = 0; ig < this->npw; ++ig)
+        for (int ig = 0; ig < npw; ++ig)
         {
-            out[ig] = tmpfac * this->fft_bundle.get_auxg_data<FPTYPE>()[this->ig2isz[ig]];
+            out[ig] = tmpfac * this->fft_bundle.get_auxg_data<FPTYPE>()[ig2isz[ig]];
         }
     }
     ModuleBase::timer::end(this->classname, "real2recip");
@@ -79,13 +83,20 @@ template <typename FPTYPE>
 void PW_Basis::real2recip(const FPTYPE* in, std::complex<FPTYPE>* out, const bool add, const FPTYPE factor) const
 {
     ModuleBase::timer::start(this->classname, "real2recip");
+    const int nrxx = this->nrxx;
+    const int npw = this->npw;
+    const int nxyz = this->nxyz;
+    const int* ig2isz = this->ig2isz;
+    const int nx = this->nx;
+    const int ny = this->ny;
+    const int nplane = this->nplane;
     if (this->gamma_only)
     {
-        const int npy = this->ny * this->nplane;
+        const int npy = ny * nplane;
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2) schedule(static)
 #endif
-        for (int ix = 0; ix < this->nx; ++ix)
+        for (int ix = 0; ix < nx; ++ix)
         {
             for (int ipy = 0; ipy < npy; ++ipy)
             {
@@ -100,7 +111,7 @@ void PW_Basis::real2recip(const FPTYPE* in, std::complex<FPTYPE>* out, const boo
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-        for (int ir = 0; ir < this->nrxx; ++ir)
+        for (int ir = 0; ir < nrxx; ++ir)
         {
             this->fft_bundle.get_auxr_data<FPTYPE>()[ir] = std::complex<FPTYPE>(in[ir], 0);
         }
@@ -112,24 +123,24 @@ void PW_Basis::real2recip(const FPTYPE* in, std::complex<FPTYPE>* out, const boo
 
     if (add)
     {
-        FPTYPE tmpfac = factor / FPTYPE(this->nxyz);
+        FPTYPE tmpfac = factor / FPTYPE(nxyz);
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-        for (int ig = 0; ig < this->npw; ++ig)
+        for (int ig = 0; ig < npw; ++ig)
         {
-            out[ig] += tmpfac * this->fft_bundle.get_auxg_data<FPTYPE>()[this->ig2isz[ig]];
+            out[ig] += tmpfac * this->fft_bundle.get_auxg_data<FPTYPE>()[ig2isz[ig]];
         }
     }
     else
     {
-        FPTYPE tmpfac = 1.0 / FPTYPE(this->nxyz);
+        FPTYPE tmpfac = 1.0 / FPTYPE(nxyz);
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-        for (int ig = 0; ig < this->npw; ++ig)
+        for (int ig = 0; ig < npw; ++ig)
         {
-            out[ig] = tmpfac * this->fft_bundle.get_auxg_data<FPTYPE>()[this->ig2isz[ig]];
+            out[ig] = tmpfac * this->fft_bundle.get_auxg_data<FPTYPE>()[ig2isz[ig]];
         }
     }
     ModuleBase::timer::end(this->classname, "real2recip");
@@ -150,10 +161,15 @@ void PW_Basis::recip2real(const std::complex<FPTYPE>* in,
 {
     ModuleBase::timer::start(this->classname, "recip2real");
     assert(this->gamma_only == false);
+    const int nst = this->nst;
+    const int nz = this->nz;
+    const int npw = this->npw;
+    const int nrxx = this->nrxx;
+    const int* ig2isz = this->ig2isz;
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-    for (int i = 0; i < this->nst * this->nz; ++i)
+    for (int i = 0; i < nst * nz; ++i)
     {
         fft_bundle.get_auxg_data<FPTYPE>()[i] = std::complex<FPTYPE>(0, 0);
     }
@@ -161,9 +177,9 @@ void PW_Basis::recip2real(const std::complex<FPTYPE>* in,
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-    for (int ig = 0; ig < this->npw; ++ig)
+    for (int ig = 0; ig < npw; ++ig)
     {
-        this->fft_bundle.get_auxg_data<FPTYPE>()[this->ig2isz[ig]] = in[ig];
+        this->fft_bundle.get_auxg_data<FPTYPE>()[ig2isz[ig]] = in[ig];
     }
     this->fft_bundle.fftzbac(fft_bundle.get_auxg_data<FPTYPE>(), fft_bundle.get_auxg_data<FPTYPE>());
 
@@ -176,7 +192,7 @@ void PW_Basis::recip2real(const std::complex<FPTYPE>* in,
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-        for (int ir = 0; ir < this->nrxx; ++ir)
+        for (int ir = 0; ir < nrxx; ++ir)
         {
             out[ir] += factor * this->fft_bundle.get_auxr_data<FPTYPE>()[ir];
         }
@@ -186,7 +202,7 @@ void PW_Basis::recip2real(const std::complex<FPTYPE>* in,
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-        for (int ir = 0; ir < this->nrxx; ++ir)
+        for (int ir = 0; ir < nrxx; ++ir)
         {
             out[ir] = this->fft_bundle.get_auxr_data<FPTYPE>()[ir];
         }
@@ -205,10 +221,18 @@ template <typename FPTYPE>
 void PW_Basis::recip2real(const std::complex<FPTYPE>* in, FPTYPE* out, const bool add, const FPTYPE factor) const
 {
     ModuleBase::timer::start(this->classname, "recip2real");
+    const int nst = this->nst;
+    const int nz = this->nz;
+    const int npw = this->npw;
+    const int nrxx = this->nrxx;
+    const int nx = this->nx;
+    const int ny = this->ny;
+    const int nplane = this->nplane;
+    const int* ig2isz = this->ig2isz;
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-    for (int i = 0; i < this->nst * this->nz; ++i)
+    for (int i = 0; i < nst * nz; ++i)
     {
         fft_bundle.get_auxg_data<FPTYPE>()[i] = std::complex<FPTYPE>(0, 0);
     }
@@ -216,9 +240,9 @@ void PW_Basis::recip2real(const std::complex<FPTYPE>* in, FPTYPE* out, const boo
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-    for (int ig = 0; ig < this->npw; ++ig)
+    for (int ig = 0; ig < npw; ++ig)
     {
-        this->fft_bundle.get_auxg_data<FPTYPE>()[this->ig2isz[ig]] = in[ig];
+        this->fft_bundle.get_auxg_data<FPTYPE>()[ig2isz[ig]] = in[ig];
     }
     this->fft_bundle.fftzbac(fft_bundle.get_auxg_data<FPTYPE>(), fft_bundle.get_auxg_data<FPTYPE>());
 
@@ -228,15 +252,14 @@ void PW_Basis::recip2real(const std::complex<FPTYPE>* in, FPTYPE* out, const boo
     {
         this->fft_bundle.fftxyc2r(fft_bundle.get_auxr_data<FPTYPE>(), fft_bundle.get_rspace_data<FPTYPE>());
 
-        // r2c in place
-        const int npy = this->ny * this->nplane;
+        const int npy = ny * nplane;
 
         if (add)
         {
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2) schedule(static)
 #endif
-            for (int ix = 0; ix < this->nx; ++ix)
+            for (int ix = 0; ix < nx; ++ix)
             {
                 for (int ipy = 0; ipy < npy; ++ipy)
                 {
@@ -249,7 +272,7 @@ void PW_Basis::recip2real(const std::complex<FPTYPE>* in, FPTYPE* out, const boo
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2) schedule(static)
 #endif
-            for (int ix = 0; ix < this->nx; ++ix)
+            for (int ix = 0; ix < nx; ++ix)
             {
                 for (int ipy = 0; ipy < npy; ++ipy)
                 {
@@ -266,7 +289,7 @@ void PW_Basis::recip2real(const std::complex<FPTYPE>* in, FPTYPE* out, const boo
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-            for (int ir = 0; ir < this->nrxx; ++ir)
+            for (int ir = 0; ir < nrxx; ++ir)
             {
                 out[ir] += factor * this->fft_bundle.get_auxr_data<FPTYPE>()[ir].real();
             }
@@ -276,7 +299,7 @@ void PW_Basis::recip2real(const std::complex<FPTYPE>* in, FPTYPE* out, const boo
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
-            for (int ir = 0; ir < this->nrxx; ++ir)
+            for (int ir = 0; ir < nrxx; ++ir)
             {
                 out[ir] = this->fft_bundle.get_auxr_data<FPTYPE>()[ir].real();
             }

From 5087b2d11852aa0dc52a5800adb59eb944f2492e Mon Sep 17 00:00:00 2001
From: abacus_fixer <mohanchen@pku.eud.cn>
Date: Fri, 15 May 2026 17:12:39 +0800
Subject: [PATCH 4/7] fix potential data race

---
 source/source_estate/module_charge/charge.cpp | 108 ++++++++++--------
 1 file changed, 62 insertions(+), 46 deletions(-)

diff --git a/source/source_estate/module_charge/charge.cpp b/source/source_estate/module_charge/charge.cpp
index 62a9e702f90..94fa0fd4548 100644
--- a/source/source_estate/module_charge/charge.cpp
+++ b/source/source_estate/module_charge/charge.cpp
@@ -359,7 +359,9 @@ void Charge::atomic_rho(const int spin_number_need,
                             gstart = 1;
                         }
                         if (PARAM.inp.test_charge > 0)
+                        {
                             std::cout << "\n |G|=0 term done." << std::endl;
+                        }
                             //----------------------------------------------------------
                             // Here we compute the G<>0 term
                             // But if in parallel case
@@ -370,14 +372,18 @@ void Charge::atomic_rho(const int spin_number_need,
     #pragma omp parallel
                         {
     #endif
-                            std::vector<double> rho1d(ucell.meshx);
+                            const int ngg = this->rhopw->ngg;
+                            const double* gg_uniq = this->rhopw->gg_uniq;
+                            const int meshx = ucell.meshx;
+                            const double tpiba = ucell.tpiba;
+                            std::vector<double> rho1d(meshx);
 
     #ifdef _OPENMP
     #pragma omp for
     #endif
-                            for (int igg = gstart; igg < this->rhopw->ngg; ++igg)
+                            for (int igg = gstart; igg < ngg; ++igg)
                             {
-                                const double gx = sqrt(this->rhopw->gg_uniq[igg]) * ucell.tpiba;
+                                const double gx = sqrt(gg_uniq[igg]) * tpiba;
                                 for (int ir = 0; ir < mesh; ir++)
                                 {
                                     if (atom->ncpp.r[ir] < 1.0e-8)
@@ -397,7 +403,9 @@ void Charge::atomic_rho(const int spin_number_need,
     #endif
                             {
                                 if (PARAM.inp.test_charge > 0)
+                                {
                                     std::cout << " |G|>0 term done." << std::endl;
+                                }
                             }
                             //----------------------------------------------------------
                             // EXPLAIN : Complete the transfer of rho from real space to
@@ -406,7 +414,7 @@ void Charge::atomic_rho(const int spin_number_need,
     #ifdef _OPENMP
     #pragma omp for
     #endif
-							for (int igg = 0; igg < this->rhopw->ngg; igg++)
+							for (int igg = 0; igg < ngg; igg++)
 							{
 								rho_lgl[igg] /= omega;
 							}
@@ -420,12 +428,14 @@ void Charge::atomic_rho(const int spin_number_need,
                     //----------------------------------------------------------
                     if (spin_number_need == 1)
                     {
+                        const int npw = this->rhopw->npw;
+                        const int* ig2igg = this->rhopw->ig2igg;
     #ifdef _OPENMP
     #pragma omp parallel for
     #endif
-                        for (int ig = 0; ig < this->rhopw->npw; ig++)
+                        for (int ig = 0; ig < npw; ig++)
                         {
-                            rho_g3d(0, ig) += strucFac(it, ig) * rho_lgl[this->rhopw->ig2igg[ig]];
+                            rho_g3d(0, ig) += strucFac(it, ig) * rho_lgl[ig2igg[ig]];
                         }
                     }
                     // mohan add 2011-06-14, initialize the charge density according to each atom
@@ -433,14 +443,18 @@ void Charge::atomic_rho(const int spin_number_need,
                     {
                         if (startmag_type == 1)
                         {
+                            const int npw = this->rhopw->npw;
+                            const int* ig2igg = this->rhopw->ig2igg;
+                            const double zv = atom->ncpp.zv;
+                            const double start_mag_it = ucell.magnet.start_mag[it];
     #ifdef _OPENMP
     #pragma omp parallel for
     #endif
-                            for (int ig = 0; ig < this->rhopw->npw; ig++)
+                            for (int ig = 0; ig < npw; ig++)
                             {
-                                const std::complex<double> swap = strucFac(it, ig) * rho_lgl[this->rhopw->ig2igg[ig]];
-                                const double up = 0.5 * (1 + ucell.magnet.start_mag[it] / atom->ncpp.zv);
-                                const double dw = 0.5 * (1 - ucell.magnet.start_mag[it] / atom->ncpp.zv);
+                                const std::complex<double> swap = strucFac(it, ig) * rho_lgl[ig2igg[ig]];
+                                const double up = 0.5 * (1 + start_mag_it / zv);
+                                const double dw = 0.5 * (1 - start_mag_it / zv);
                                 rho_g3d(0, ig) += swap * up;
                                 rho_g3d(1, ig) += swap * dw;
                             }
@@ -449,25 +463,24 @@ void Charge::atomic_rho(const int spin_number_need,
                         else if (startmag_type == 2)
                         {
                             std::complex<double> ci_tpi = ModuleBase::NEG_IMAG_UNIT * ModuleBase::TWO_PI;
+                            const int npw = this->rhopw->npw;
+                            const ModuleBase::Vector3<double>* gcar = this->rhopw->gcar;
+                            const int* ig2igg = this->rhopw->ig2igg;
+                            const double zv = atom->ncpp.zv;
                             for (int ia = 0; ia < atom->na; ia++)
                             {
-                                // const double up = 0.5 * ( 1 + atom->mag[ia] );
-                                // const double dw = 0.5 * ( 1 - atom->mag[ia] );
                                 const double up = 0.5 * (1 + atom->mag[ia] / atom->ncpp.zv);
                                 const double dw = 0.5 * (1 - atom->mag[ia] / atom->ncpp.zv);
-                                // std::cout << " atom " << ia << " up=" << up << " dw=" << dw << std::endl;
+                                const double tau_x = atom->tau[ia].x;
+                                const double tau_y = atom->tau[ia].y;
+                                const double tau_z = atom->tau[ia].z;
     #ifdef _OPENMP
     #pragma omp parallel for
     #endif
-                                for (int ig = 0; ig < this->rhopw->npw; ig++)
+                                for (int ig = 0; ig < npw; ig++)
                                 {
-                                    const double Gtau = this->rhopw->gcar[ig][0] * atom->tau[ia].x
-                                                        + this->rhopw->gcar[ig][1] * atom->tau[ia].y
-                                                        + this->rhopw->gcar[ig][2] * atom->tau[ia].z;
-
-                                    std::complex<double> swap
-                                        = ModuleBase::libm::exp(ci_tpi * Gtau) * rho_lgl[this->rhopw->ig2igg[ig]];
-
+                                    const double Gtau = gcar[ig][0] * tau_x + gcar[ig][1] * tau_y + gcar[ig][2] * tau_z;
+                                    std::complex<double> swap = ModuleBase::libm::exp(ci_tpi * Gtau) * rho_lgl[ig2igg[ig]];
                                     rho_g3d(0, ig) += swap * up;
                                     rho_g3d(1, ig) += swap * dw;
                                 }
@@ -481,37 +494,42 @@ void Charge::atomic_rho(const int spin_number_need,
                         {
                             double sin_a1, sin_a2, cos_a1, cos_a2;
                             if (PARAM.globalv.domag)
-                            { // will not be used now, will be deleted later
+                            {
                                 ModuleBase::libm::sincos(atom->angle1[0], &sin_a1, &cos_a1);
                                 ModuleBase::libm::sincos(atom->angle2[0], &sin_a2, &cos_a2);
                             }
+                            const int npw = this->rhopw->npw;
+                            const int* ig2igg = this->rhopw->ig2igg;
+                            const double zv = atom->ncpp.zv;
+                            const double start_mag_it = ucell.magnet.start_mag[it];
     #ifdef _OPENMP
     #pragma omp parallel for
     #endif
-                            for (int ig = 0; ig < this->rhopw->npw; ig++)
+                            for (int ig = 0; ig < npw; ig++)
                             {
-                                const std::complex<double> swap = strucFac(it, ig) * rho_lgl[this->rhopw->ig2igg[ig]];
+                                const std::complex<double> swap = strucFac(it, ig) * rho_lgl[ig2igg[ig]];
                                 rho_g3d(0, ig) += swap;
                                 if (PARAM.globalv.domag)
-                                { // will not be used now, will be deleted later
-                                    rho_g3d(1, ig)
-                                        += swap * (ucell.magnet.start_mag[it] / atom->ncpp.zv) * sin_a1 * cos_a2;
-                                    rho_g3d(2, ig)
-                                        += swap * (ucell.magnet.start_mag[it] / atom->ncpp.zv) * sin_a1 * sin_a2;
-                                    rho_g3d(3, ig)
-                                        += swap * (ucell.magnet.start_mag[it] / atom->ncpp.zv) * cos_a1;
+                                {
+                                    rho_g3d(1, ig) += swap * (start_mag_it / zv) * sin_a1 * cos_a2;
+                                    rho_g3d(2, ig) += swap * (start_mag_it / zv) * sin_a1 * sin_a2;
+                                    rho_g3d(3, ig) += swap * (start_mag_it / zv) * cos_a1;
                                 }
                                 else if (PARAM.globalv.domag_z)
                                 {
                                     rho_g3d(1, ig) = 0.0;
                                     rho_g3d(2, ig) = 0.0;
-                                    rho_g3d(3, ig) += swap * (ucell.magnet.start_mag[it] / atom->ncpp.zv);
+                                    rho_g3d(3, ig) += swap * (start_mag_it / zv);
                                 }
                             }
                         }
                         else if (startmag_type == 2)
-                        { // zdy-warning-not-available
+                        {
                             std::complex<double> ci_tpi = ModuleBase::NEG_IMAG_UNIT * ModuleBase::TWO_PI;
+                            const int npw = this->rhopw->npw;
+                            const ModuleBase::Vector3<double>* gcar = this->rhopw->gcar;
+                            const int* ig2igg = this->rhopw->ig2igg;
+                            const double zv = atom->ncpp.zv;
                             for (int ia = 0; ia < atom->na; ia++)
                             {
                                 double sin_a1, sin_a2, cos_a1, cos_a2;
@@ -523,29 +541,27 @@ void Charge::atomic_rho(const int spin_number_need,
                                 {
                                     ModuleBase::libm::sincos(atom->angle2[ia], &sin_a2, &cos_a2);
                                 }
+                                const double mag_ia = atom->mag[ia];
+                                const double tau_x = atom->tau[ia].x;
+                                const double tau_y = atom->tau[ia].y;
+                                const double tau_z = atom->tau[ia].z;
     #ifdef _OPENMP
     #pragma omp parallel for
     #endif
-                                for (int ig = 0; ig < this->rhopw->npw; ig++)
+                                for (int ig = 0; ig < npw; ig++)
                                 {
-                                    const double Gtau = this->rhopw->gcar[ig][0] * atom->tau[ia].x
-                                                        + this->rhopw->gcar[ig][1] * atom->tau[ia].y
-                                                        + this->rhopw->gcar[ig][2] * atom->tau[ia].z;
-
-                                    std::complex<double> swap = exp(ci_tpi * Gtau) * rho_lgl[this->rhopw->ig2igg[ig]];
-
-                                    // calculate rho_total
+                                    const double Gtau = gcar[ig][0] * tau_x + gcar[ig][1] * tau_y + gcar[ig][2] * tau_z;
+                                    std::complex<double> swap = exp(ci_tpi * Gtau) * rho_lgl[ig2igg[ig]];
+                                    const double mag_factor = mag_ia / zv;
                                     rho_g3d(0, ig) += swap;
-                                    // calculate mag_z
                                     if (PARAM.globalv.domag || PARAM.globalv.domag_z)
                                     {
-                                        rho_g3d(3, ig) += swap * (atom->mag[ia] / atom->ncpp.zv) * cos_a1;
+                                        rho_g3d(3, ig) += swap * mag_factor * cos_a1;
                                     }
-                                    // calculate mag_x and mag_y
                                     if (PARAM.globalv.domag)
                                     {
-                                        rho_g3d(1, ig) += swap * (atom->mag[ia] / atom->ncpp.zv) * sin_a1 * cos_a2;
-                                        rho_g3d(2, ig) += swap * (atom->mag[ia] / atom->ncpp.zv) * sin_a1 * sin_a2;
+                                        rho_g3d(1, ig) += swap * mag_factor * sin_a1 * cos_a2;
+                                        rho_g3d(2, ig) += swap * mag_factor * sin_a1 * sin_a2;
                                     }
                                     else
                                     {

From 1755bfbc577d1bdbb1b57c197406fa3dd2a5a199 Mon Sep 17 00:00:00 2001
From: abacus_fixer <mohanchen@pku.eud.cn>
Date: Fri, 15 May 2026 17:16:26 +0800
Subject: [PATCH 5/7] update format

---
 source/source_base/math_integral.cpp | 44 ++++++++++++++--------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/source/source_base/math_integral.cpp b/source/source_base/math_integral.cpp
index 0dd8434bc37..5b4e7cc125e 100644
--- a/source/source_base/math_integral.cpp
+++ b/source/source_base/math_integral.cpp
@@ -37,7 +37,7 @@ void Integral::Simpson_Integral
     
     //  simpson's rule integrator for function stored on the
     //  radial logarithmic mesh
-    //	routine assumes that mesh is an odd number so run check
+    //    routine assumes that mesh is an odd number so run check
     if (mesh % 2 == 0)
     {
         std::cout << "\n error in subroutine simpson ";
@@ -83,21 +83,21 @@ void Integral::Simpson_Integral
     */
     //  simpson's rule integrator for function stored on the
     //  radial logarithmic mesh
-    //	routine assumes that mesh is an odd number so run check
+    //    routine assumes that mesh is an odd number so run check
     assert(mesh&1);
 
     asum = 0.00;
-	const size_t end = mesh-2;
+    const size_t end = mesh-2;
     for( size_t i=1; i!=end; i+=2 )
     {
-		const double f1 = func[i]*rab[i];
-		asum += f1 + f1 + func[i+1]*rab[i+1];
+        const double f1 = func[i]*rab[i];
+        asum += f1 + f1 + func[i+1]*rab[i+1];
     }
-	const double f1 = func[mesh-2]*rab[mesh-2];
-	asum += f1+f1;
-	asum += asum;
-	asum += func[0]*rab[0] + func[mesh-1]*rab[mesh-1];
-	asum /= 3.0;
+    const double f1 = func[mesh-2]*rab[mesh-2];
+    asum += f1+f1;
+    asum += asum;
+    asum += func[0]*rab[0] + func[mesh-1]*rab[mesh-1];
+    asum /= 3.0;
     return;
 }// end subroutine simpson
 
@@ -124,21 +124,21 @@ void Integral::Simpson_Integral
     */
     //  simpson's rule integrator for function stored on the
     //  radial logarithmic mesh
-    //	routine assumes that mesh is an odd number so run check
+    //    routine assumes that mesh is an odd number so run check
     assert(mesh&1);
 
     asum = 0.00;
-	const size_t end = mesh-2;
+    const size_t end = mesh-2;
     for(size_t i=1; i!=end; i+=2 )
     {
-		const double f1 = func[i];
-		asum += f1 + f1 + func[i+1];
+        const double f1 = func[i];
+        asum += f1 + f1 + func[i+1];
     }
-	const double f1 = func[mesh-2];
-	asum += f1+f1;
-	asum += asum;
-	asum += func[0] + func[mesh-1];
-	asum *= dr/3.0;
+    const double f1 = func[mesh-2];
+    asum += f1+f1;
+    asum += asum;
+    asum += func[0] + func[mesh-1];
+    asum *= dr/3.0;
     return;
 }// end subroutine simpson
 
@@ -220,10 +220,10 @@ void Integral::Simpson_Integral_alltoinf
 
     const double asum_all = asum[mesh-1];
     for (int i = 0;i < mesh; ++i)
-	{
+    {
         asum[i] = asum_all - asum[i];
-	}
-	return;
+    }
+    return;
 }
 
 double Integral::simpson(const int n, const double* const f, const double dx) 

From 23d40e3dd8f4812fe5a6db86e79b7efe8cf8ff29 Mon Sep 17 00:00:00 2001
From: abacus_fixer <mohanchen@pku.eud.cn>
Date: Fri, 15 May 2026 17:28:41 +0800
Subject: [PATCH 6/7] fix data race in fft

---
 source/source_base/module_fft/fft_cpu.cpp | 114 +++++++++++++++-------
 1 file changed, 80 insertions(+), 34 deletions(-)

diff --git a/source/source_base/module_fft/fft_cpu.cpp b/source/source_base/module_fft/fft_cpu.cpp
index f50f6e9e868..c486c31642a 100644
--- a/source/source_base/module_fft/fft_cpu.cpp
+++ b/source/source_base/module_fft/fft_cpu.cpp
@@ -344,60 +344,86 @@ void FFT_CPU<double>::clear()
 template <>
 void FFT_CPU<double>::fftxyfor(std::complex<double>* in, std::complex<double>* out) const
 {
-    int npy = this->nplane * this->ny;
+    const int npy = this->nplane * this->ny;
+    const int nx = this->nx;
+    const int lixy = this->lixy;
+    const int rixy = this->rixy;
+    const int nplane = this->nplane;
+    const fftw_plan planyfor = this->planyfor;
+    const fftw_plan planxfor1 = this->planxfor1;
+    const fftw_plan planxfor2 = this->planxfor2;
     if (this->xprime)
     {
         
-        fftw_execute_dft(this->planxfor1, (fftw_complex*)in, (fftw_complex*)out);
+        fftw_execute_dft(planxfor1, (fftw_complex*)in, (fftw_complex*)out);
+#ifdef _OPENMP
         #pragma omp parallel for
-        for (int i = 0; i < this->lixy + 1; ++i)
+#endif
+        for (int i = 0; i < lixy + 1; ++i)
         {
-            fftw_execute_dft(this->planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
+            fftw_execute_dft(planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
         }
+#ifdef _OPENMP
         #pragma omp parallel for
-        for (int i = rixy; i < this->nx; ++i)
+#endif
+        for (int i = rixy; i < nx; ++i)
         {
-            fftw_execute_dft(this->planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
+            fftw_execute_dft(planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
         }
     }
     else
     {
+#ifdef _OPENMP
         #pragma omp parallel for
-        for (int i = 0; i < this->nx; ++i)
+#endif
+        for (int i = 0; i < nx; ++i)
         {
-            fftw_execute_dft(this->planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
+            fftw_execute_dft(planyfor, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
         }
-        fftw_execute_dft(this->planxfor1, (fftw_complex*)in, (fftw_complex*)out);
-        fftw_execute_dft(this->planxfor2, (fftw_complex*)&in[rixy * nplane], (fftw_complex*)&out[rixy * nplane]);
+        fftw_execute_dft(planxfor1, (fftw_complex*)in, (fftw_complex*)out);
+        fftw_execute_dft(planxfor2, (fftw_complex*)&in[rixy * nplane], (fftw_complex*)&out[rixy * nplane]);
     }
 }
 
 template <>
 void FFT_CPU<double>::fftxybac(std::complex<double>* in,std::complex<double>* out) const
 {
-    int npy = this->nplane * this->ny;
+    const int npy = this->nplane * this->ny;
+    const int nx = this->nx;
+    const int lixy = this->lixy;
+    const int rixy = this->rixy;
+    const int nplane = this->nplane;
+    const fftw_plan planybac = this->planybac;
+    const fftw_plan planxbac1 = this->planxbac1;
+    const fftw_plan planxbac2 = this->planxbac2;
     if (this->xprime)
     {
+#ifdef _OPENMP
         #pragma omp parallel for
-        for (int i = 0; i < this->lixy + 1; ++i)
+#endif
+        for (int i = 0; i < lixy + 1; ++i)
         {
-            fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
+            fftw_execute_dft(planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
         }
+#ifdef _OPENMP
         #pragma omp parallel for
-        for (int i = rixy; i < this->nx; ++i)
+#endif
+        for (int i = rixy; i < nx; ++i)
         {
-            fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
+            fftw_execute_dft(planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
         }
-        fftw_execute_dft(this->planxbac1, (fftw_complex*)in, (fftw_complex*)out);
+        fftw_execute_dft(planxbac1, (fftw_complex*)in, (fftw_complex*)out);
     }
     else
     {
-        fftw_execute_dft(this->planxbac1, (fftw_complex*)in, (fftw_complex*)out);
-        fftw_execute_dft(this->planxbac2, (fftw_complex*)&in[rixy * nplane], (fftw_complex*)&out[rixy * nplane]);
+        fftw_execute_dft(planxbac1, (fftw_complex*)in, (fftw_complex*)out);
+        fftw_execute_dft(planxbac2, (fftw_complex*)&in[rixy * nplane], (fftw_complex*)&out[rixy * nplane]);
+#ifdef _OPENMP
         #pragma omp parallel for
-        for (int i = 0; i < this->nx; ++i)
+#endif
+        for (int i = 0; i < nx; ++i)
         {
-            fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
+            fftw_execute_dft(planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&out[i * npy]);
         }
     }
 }
@@ -417,47 +443,67 @@ void FFT_CPU<double>::fftzbac(std::complex<double>* in, std::complex<double>* ou
 template <>
 void FFT_CPU<double>::fftxyr2c(double* in, std::complex<double>* out) const
 {
-    int npy = this->nplane * this->ny;
+    const int npy = this->nplane * this->ny;
+    const int nx = this->nx;
+    const int lixy = this->lixy;
+    const fftw_plan planxr2c = this->planxr2c;
+    const fftw_plan planyfor = this->planyfor;
+    const fftw_plan planyr2c = this->planyr2c;
+    const fftw_plan planxfor1 = this->planxfor1;
     if (this->xprime)
     {
-        fftw_execute_dft_r2c(this->planxr2c, in, (fftw_complex*)out);
+        fftw_execute_dft_r2c(planxr2c, in, (fftw_complex*)out);
+#ifdef _OPENMP
         #pragma omp parallel for
-        for (int i = 0; i < this->lixy + 1; ++i)
+#endif
+        for (int i = 0; i < lixy + 1; ++i)
         {
-            fftw_execute_dft(this->planyfor, (fftw_complex*)&out[i * npy], (fftw_complex*)&out[i * npy]);
+            fftw_execute_dft(planyfor, (fftw_complex*)&out[i * npy], (fftw_complex*)&out[i * npy]);
         }
     }
     else
     {
+#ifdef _OPENMP
         #pragma omp parallel for
-        for (int i = 0; i < this->nx; ++i)
+#endif
+        for (int i = 0; i < nx; ++i)
         {
-            fftw_execute_dft_r2c(this->planyr2c, &in[i * npy], (fftw_complex*)&out[i * npy]);
+            fftw_execute_dft_r2c(planyr2c, &in[i * npy], (fftw_complex*)&out[i * npy]);
         }
-        fftw_execute_dft(this->planxfor1, (fftw_complex*)out, (fftw_complex*)out);
+        fftw_execute_dft(planxfor1, (fftw_complex*)out, (fftw_complex*)out);
     }
 }
 
 template <>
 void FFT_CPU<double>::fftxyc2r(std::complex<double> *in,double *out) const
 {
-    int npy = this->nplane * this->ny;
+    const int npy = this->nplane * this->ny;
+    const int nx = this->nx;
+    const int lixy = this->lixy;
+    const fftw_plan planybac = this->planybac;
+    const fftw_plan planxc2r = this->planxc2r;
+    const fftw_plan planxbac1 = this->planxbac1;
+    const fftw_plan planyc2r = this->planyc2r;
     if (this->xprime)
     {
+#ifdef _OPENMP
         #pragma omp parallel for
-        for (int i = 0; i < this->lixy + 1; ++i)
+#endif
+        for (int i = 0; i < lixy + 1; ++i)
         {
-            fftw_execute_dft(this->planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&in[i * npy]);
+            fftw_execute_dft(planybac, (fftw_complex*)&in[i * npy], (fftw_complex*)&in[i * npy]);
         }
-        fftw_execute_dft_c2r(this->planxc2r, (fftw_complex*)in, out);
+        fftw_execute_dft_c2r(planxc2r, (fftw_complex*)in, out);
     }
     else
     {
-        fftw_execute_dft(this->planxbac1, (fftw_complex*)in, (fftw_complex*)in);
+        fftw_execute_dft(planxbac1, (fftw_complex*)in, (fftw_complex*)in);
+#ifdef _OPENMP
         #pragma omp parallel for
-        for (int i = 0; i < this->nx; ++i)
+#endif
+        for (int i = 0; i < nx; ++i)
         {
-            fftw_execute_dft_c2r(this->planyc2r, (fftw_complex*)&in[i * npy], &out[i * npy]);
+            fftw_execute_dft_c2r(planyc2r, (fftw_complex*)&in[i * npy], &out[i * npy]);
         }
     }
 }

From 76948b97cc63d1d2ec12fa8d1246c802dd8f6628 Mon Sep 17 00:00:00 2001
From: abacus_fixer <mohanchen@pku.eud.cn>
Date: Sat, 16 May 2026 22:06:56 +0800
Subject: [PATCH 7/7] fix

---
 source/source_basis/module_pw/pw_gatherscatter.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/source/source_basis/module_pw/pw_gatherscatter.h b/source/source_basis/module_pw/pw_gatherscatter.h
index 904666b7faa..ae42f15d624 100644
--- a/source/source_basis/module_pw/pw_gatherscatter.h
+++ b/source/source_basis/module_pw/pw_gatherscatter.h
@@ -60,6 +60,12 @@ void PW_Basis::gatherp_scatters(std::complex<T>* in, std::complex<T>* out) const
 
     //exchange data
     //(nplane,nstot) to (numz[ip],ns, poolnproc)
+    // OMP barrier: Ensure all OMP threads have finished writing to buffers
+    // before MPI communication reads from them. This prevents data race
+    // between OMP parallel write (e.g., FFT calculation) and MPI read.
+#ifdef _OPENMP
+#pragma omp barrier
+#endif
     if(typeid(T) == typeid(double))
     {
         MPI_Alltoallv(out, numr, startr, MPI_DOUBLE_COMPLEX, in, numg, startg, MPI_DOUBLE_COMPLEX, this->pool_world);
@@ -170,6 +176,12 @@ void PW_Basis::gathers_scatterp(std::complex<T>* in, std::complex<T>* out) const
 
     //exchange data
     //(numz[ip],ns, poolnproc) to (nplane,nstot)
+    // OMP barrier: Ensure all OMP threads have finished writing to buffers
+    // before MPI communication reads from them. This prevents data race
+    // between OMP parallel write (e.g., FFT calculation) and MPI read.
+#ifdef _OPENMP
+#pragma omp barrier
+#endif
     if(typeid(T) == typeid(double))
     {
         MPI_Alltoallv(out, numg, startg, MPI_DOUBLE_COMPLEX, in, numr, startr, MPI_DOUBLE_COMPLEX, this->pool_world);