diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a21d54b2a82..fddc76f5c1f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -237,9 +237,16 @@ jobs:
         run: |
           ctest --test-dir build -V --timeout 1700 -R 10_others
           
+      - name: 17_DS_DFTU Test
+        env:
+          GTEST_COLOR: 'yes'
+          OMP_NUM_THREADS: '2'
+        run: |
+          ctest --test-dir build -V --timeout 1700 -R 17_DS_DFTU
+          
       - name: Other Unittests
         env:
           GTEST_COLOR: 'yes'
           OMP_NUM_THREADS: '2'
         run: |
-          ctest --test-dir build -V --timeout 1700 -E 'integrate_test|01_PW|02_NAO_Gamma|03_NAO_multik|04_FF|05_rtTDDFT|06_SDFT|07_OFDFT|08_EXX|09_DeePKS|10_others|11_PW_GPU|12_NAO_Gamma_GPU|13_NAO_multik_GPU|15_rtTDDFT_GPU|16_SDFT_GPU|MODULE_BASE|MODULE_IO|MODULE_HSOLVER|MODULE_CELL|MODULE_MD|MODULE_PSI|MODULE_ESTATE|MODULE_RI|MODULE_HAMILT|MODULE_PW|MODULE_LCAO|MODULE_AO|MODULE_NAO|MODULE_RELAX|MODULE_LR'
+          ctest --test-dir build -V --timeout 1700 -E 'integrate_test|01_PW|02_NAO_Gamma|03_NAO_multik|04_FF|05_rtTDDFT|06_SDFT|07_OFDFT|08_EXX|09_DeePKS|10_others|11_PW_GPU|12_NAO_Gamma_GPU|13_NAO_multik_GPU|15_rtTDDFT_GPU|16_SDFT_GPU|17_DS_DFTU|MODULE_BASE|MODULE_IO|MODULE_HSOLVER|MODULE_CELL|MODULE_MD|MODULE_PSI|MODULE_ESTATE|MODULE_RI|MODULE_HAMILT|MODULE_PW|MODULE_LCAO|MODULE_AO|MODULE_NAO|MODULE_RELAX|MODULE_LR'
diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index b5f458c21ea..8df45d94d3c 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -394,6 +394,8 @@
     - [sccut](#sccut)
     - [sc\_drop\_thr](#sc_drop_thr)
     - [sc\_scf\_thr](#sc_scf_thr)
+    - [sc\_direction\_only](#sc_direction_only)
+    - [sc\_lambda\_strategy](#sc_lambda_strategy)
   - [vdW correction](#vdw-correction)
     - [vdw\_method](#vdw_method)
     - [vdw\_s6](#vdw_s6)
@@ -3481,8 +3483,8 @@
 
 - **Type**: Integer
 - **Description**: Determines whether to calculate the plus U correction, which is especially important for correlated electrons.
-  - 1: Calculate plus U correction with radius-adjustable localized projections (with parameter onsite_radius).
-  - 2: Calculate plus U correction using first zeta of NAOs as projections (this is old method for testing).
+  - 1: Calculate plus U correction with radius-adjustable localized projections (with parameter onsite_radius). Supported for both PW and LCAO basis sets.
+  - 2: Calculate plus U correction using first zeta of NAOs as projections (this is old method for testing). Only available for LCAO basis.
   - 0: Do not calculate plus U correction.
 - **Default**: 0
 
@@ -3629,6 +3631,24 @@
 - **Description**: Density error threshold for inner loop of spin-constrained SCF
 - **Default**: 1.0e-4
 
+### sc_direction_only
+
+- **Type**: Boolean
+- **Availability**: *sc_mag_switch is true*
+- **Description**: When true, only the direction of the magnetic moment is constrained to the target direction, while the magnitude is allowed to vary freely. This is useful for studying magnetic anisotropy or when the magnitude of the moment is determined by the electronic structure rather than an external constraint. When false (default), both the direction and magnitude of the magnetic moment are constrained to the target values.
+- **Default**: False
+
+### sc_lambda_strategy
+
+- **Type**: String
+- **Availability**: *sc_mag_switch is true*
+- **Description**: Lambda update strategy for spin-constrained DFT. Available options are:
+  - bfgs: BFGS quasi-Newton method (default, robust and well-tested)
+  - linear_response: linear response method (Scheme B)
+  - augmented_lagrangian: augmented Lagrangian method (Scheme C)
+  - hybrid_delayed: hybrid delayed update (Scheme D)
+- **Default**: bfgs
+
 [back to top](#full-list-of-input-keywords)
 
 ## vdW correction
diff --git a/docs/advanced/scf/construct_H.md b/docs/advanced/scf/construct_H.md
index 69a22ad80e9..3100b934876 100644
--- a/docs/advanced/scf/construct_H.md
+++ b/docs/advanced/scf/construct_H.md
@@ -77,6 +77,6 @@ Here, we use a simple [example calculation](https://github.com/deepmodeling/abac
 
 Conventional functionals, e.g., L(S)DA and GGAs, encounter failures in strongly correlated systems, usually characterized by partially filled *d*/*f* shells. These include transition metals (TM) and their oxides, rare-earth compounds, and actinides, to name a few, where L(S)DA/GGAs typically yield quantitatively or even qualitatively wrong results. To address this failure, an efficient and successful method named DFT+*U*, which inherits the efficiency of L(S)DA/GGA but gains the strength of the Hubbard model in describing the physics of strongly correlatedsystems, has been developed.
 
-Now the DFT+*U* method is accessible in ABACUS. The details of the DFT+*U* method could be found in this [paper](https://doi.org/10.1063/5.0090122). It should be noted that the DFT+*U* works only within the NAO scheme, which means that the value of the keyword `basis_type` must be lcao when DFT+*U* is called. To turn on DFT+*U*, users need to set the value of the `dft_plus_u` keyword in the `INPUT` file to be 1. All relevant parmeters used in DFT+*U* calculations are listed in the [DFT+*U* correction](../input_files/input-main.md#dftu-correction) part of the [list of keywords](../input_files/input-main.md).
+Now the DFT+*U* method is accessible in ABACUS. The details of the DFT+*U* method could be found in this [paper](https://doi.org/10.1063/5.0090122). DFT+*U* is supported for both LCAO (`basis_type = lcao`) and plane-wave (`basis_type = pw`) basis sets. For the PW basis, `dft_plus_u = 1` (radius-adjustable localized projections) is supported with `nspin = 1`, `2`, or `4`. For the LCAO basis, both `dft_plus_u = 1` and `dft_plus_u = 2` are available. To turn on DFT+*U*, users need to set the value of the `dft_plus_u` keyword in the `INPUT` file to be 1. All relevant parameters used in DFT+*U* calculations are listed in the [DFT+*U* correction](../input_files/input-main.md#dftu-correction) part of the [list of keywords](../input_files/input-main.md).
 
 Examples of DFT+*U* calculations are provided in this [directory](https://github.com/deepmodeling/abacus-develop/tree/develop/examples/dft_plus_u).
diff --git a/docs/advanced/scf/spin.md b/docs/advanced/scf/spin.md
index 1749db156dc..2de590e3c28 100644
--- a/docs/advanced/scf/spin.md
+++ b/docs/advanced/scf/spin.md
@@ -28,6 +28,224 @@ If **"ocp=1"** and **"ocp_set"** is set in INPUT file, the occupations of states
 2. **"nupdown"**
 If **"nupdown"** is set to non-zero, number of spin-up and spin-down electrons will be fixed, and Fermi energy level will split to E_Fermi_up and E_Fermi_down. By the way, total magnetization will also be fixed, and will be the value of **"nupdown"**.
 
+## DeltaSpin (Spin-Constrained DFT)
+
+DeltaSpin is a spin-constrained DFT method that allows users to constrain the magnetic moments on individual atoms to target values during self-consistent field (SCF) calculations. This is useful for studying magnetic excitations, non-collinear magnetic structures, and systems where the magnetic ground state is not known a priori.
+
+The theoretical foundation and implementation details can be found in:
+
+- Cai Z, Wang K, Xu Y, et al., "A self-adaptive first-principles approach for magnetic excited states," *Quantum Frontiers* 2.1 (2023): 21. [DOI: 10.1007/s44214-023-00050-z](https://doi.org/10.1007/s44214-023-00050-z)
+- Zheng D, Peng X, Huang Y, et al., "Integrating deep-learning-based magnetic model and non-collinear spin-constrained method: methodology, implementation and application," *npj Computational Materials* (2026).
+
+### Enabling DeltaSpin
+
+Set `sc_mag_switch 1` in the INPUT file. DeltaSpin is supported for both PW (`basis_type = pw`) and LCAO (`basis_type = lcao`) basis sets, with `nspin = 2` (collinear) or `nspin = 4` (non-collinear).
+
+### Specifying Target Magnetic Moments in STRU
+
+Target magnetic moments and constraint flags are specified per atom in the `ATOMIC_POSITIONS` section of the STRU file, using the `mag` (or `magmom`), `sc`, `lambda`, `angle1`, and `angle2` keywords after the atomic coordinates.
+
+#### Collinear (nspin=2)
+
+For collinear spin, only the z-component of the magnetic moment is constrained:
+
+```
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00  0.00  0.00  mag  2.0   sc 1
+0.51  0.51  0.51  mag  -2.0  sc 1
+```
+
+- `mag 2.0`: target magnetic moment of 2.0 $\mu_B$ along z-axis
+- `sc 1`: constrain the z-component (1 = constrained, 0 = unconstrained)
+
+#### Non-collinear (nspin=4), vector form
+
+For non-collinear spin, specify the magnetic moment as a vector (mx, my, mz):
+
+```
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00  0.00  0.00  mag  2.0  0.0  0.0  sc 1 1 1
+0.51  0.51  0.51  mag  0.0  0.0  -2.0  sc 1 1 1
+```
+
+- `mag 2.0 0.0 0.0`: target moment vector in Cartesian coordinates ($\mu_B$)
+- `sc 1 1 1`: constrain x, y, z components respectively
+
+#### Non-collinear (nspin=4), angle form
+
+Alternatively, use `angle1` (polar angle $\theta$) and `angle2` (azimuthal angle $\phi$) in degrees to specify the direction:
+
+```
+0.00  0.00  0.00  mag 2.0  angle1 0  angle2 0    sc 1 1 1
+0.51  0.51  0.51  mag 2.0  angle1 180  angle2 0  sc 1 1 1
+```
+
+The Cartesian components are computed as:
+- $m_z = |\mathbf{m}| \cos\theta$
+- $m_x = |\mathbf{m}| \sin\theta \cos\phi$
+- $m_y = |\mathbf{m}| \sin\theta \sin\phi$
+
+#### Providing initial Lagrange multipliers
+
+Initial lambda values (in eV/$\mu_B$) can be provided via the `lambda` keyword to accelerate convergence:
+
+```
+0.00  0.00  0.00  mag 2.0  lambda 0.01 0.0 0.0  sc 1 1 1
+```
+
+A single value sets $\lambda_z$; three values set $\lambda_x$, $\lambda_y$, $\lambda_z$.
+
+#### Partial constraints
+
+Set `sc 0` for unconstrained components. For example, to constrain only the direction but not the magnitude (use with `sc_direction_only`):
+
+```
+0.00  0.00  0.00  mag 2.0  0.0  0.0  sc 1 1 0
+```
+
+### DeltaSpin INPUT Parameters
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `sc_mag_switch` | Boolean | False | Enable DeltaSpin |
+| `sc_thr` | Real | 1.0e-6 | Convergence criterion for lambda loop (RMS, in $\mu_B$) |
+| `nsc` | Integer | 100 | Maximum number of lambda iterations |
+| `nsc_min` | Integer | 2 | Minimum number of lambda iterations |
+| `sc_scf_nmin` | Integer | 2 | Minimum outer SCF iterations before starting lambda loop |
+| `alpha_trial` | Real | 0.01 | Initial trial step size for lambda (eV/$\mu_B^2$) |
+| `sccut` | Real | 3.0 | Maximum step size for lambda (eV/$\mu_B$) |
+| `sc_drop_thr` | Real | 1.0e-2 | Convergence ratio threshold for adaptive lambda loop |
+| `sc_scf_thr` | Real | 1.0e-4 | Density error threshold for entering lambda loop |
+| `sc_direction_only` | Boolean | False | Constrain only the direction, not the magnitude |
+| `sc_lambda_strategy` | String | bfgs | Lambda update strategy (see below) |
+| `decay_grad_switch` | Boolean | False | Enable gradient-based early exit |
+
+For full parameter details, see the [Spin-Constrained DFT](../input_files/input-main.md#spin-constrained-dft) section of the input keyword list.
+
+### Lambda Update Strategies
+
+The `sc_lambda_strategy` parameter controls how the Lagrange multipliers $\lambda$ are updated during the lambda loop:
+
+- **`bfgs`** (default): BFGS quasi-Newton method with line search. Robust and well-tested for both PW and LCAO. Uses `alpha_trial` and `sccut` to control step size.
+
+- **`linear_response`**: Linear response method (Scheme B). Estimates the magnetic susceptibility $\chi$ from the history of $(\lambda, M)$ pairs and performs a one-step Newton-like update: $\Delta\lambda = \beta (M_{\text{target}} - M) / \chi$, where $\beta$ is a mixing parameter.
+
+- **`augmented_lagrangian`**: Augmented Lagrangian method (Scheme C). Uses a penalty parameter $\mu$ that grows over iterations: $\lambda_{\text{new}} = \lambda + \mu (M - M_{\text{target}})$. The penalty increases until convergence is achieved.
+
+- **`hybrid_delayed`**: Hybrid delayed update (Scheme D). Two-phase approach: in the early phase (SCF not yet converged), lambda updates are gentle; in the late phase (SCF nearly converged), augmented Lagrangian updates are applied.
+
+### Direction-Only Mode
+
+When `sc_direction_only 1` is set, only the **direction** of the magnetic moment is constrained to match the target, while the magnitude is allowed to vary freely. This is useful for:
+
+- Studying magnetic anisotropy energy surfaces
+- Cases where the moment magnitude is determined by the electronic structure
+- Converging to the easy-axis direction without fixing the moment size
+
+In this mode, the lambda vector is projected to be perpendicular to the target moment direction at each iteration, ensuring it can only rotate the magnetization, not stretch it.
+
+### Combining DeltaSpin with DFT+U
+
+DeltaSpin can be combined with DFT+U for strongly correlated systems. When both `sc_mag_switch` and `dft_plus_u` are enabled:
+
+1. DFT+U occupation update runs first in each SCF iteration
+2. DeltaSpin lambda loop runs after, constraining the magnetic moments
+3. The DFT+U-corrected Hamiltonian is used by the lambda loop
+
+Example INPUT for PW DFT+U + DeltaSpin:
+
+```
+INPUT_PARAMETERS
+calculation         scf
+basis_type          pw
+ecutwfc             50
+nspin               2
+dft_plus_u          1
+orbital_corr        -1 2
+hubbard_u           0.0 4.0
+sc_mag_switch       1
+sc_thr              1.0e-6
+sc_scf_thr          1.0e-4
+sc_lambda_strategy  bfgs
+```
+
+### Example: Collinear antiferromagnetic Fe
+
+INPUT file:
+
+```
+INPUT_PARAMETERS
+calculation         scf
+basis_type          pw
+ecutwfc             50
+nspin               2
+sc_mag_switch       1
+sc_thr              1.0e-6
+```
+
+STRU file:
+
+```
+ATOMIC_SPECIES
+Fe 55.845 Fe.upf
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00  0.50  0.50
+ 0.50  1.00  0.50
+ 0.50  0.50  1.00
+
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00  0.00  0.00  mag  2.0  sc 1
+0.51  0.51  0.51  mag  -2.0  sc 1
+```
+
+### Example: Non-collinear constrained moments
+
+INPUT file:
+
+```
+INPUT_PARAMETERS
+calculation         scf
+basis_type          pw
+ecutwfc             50
+nspin               4
+noncolin            1
+sc_mag_switch       1
+sc_direction_only   1
+sc_lambda_strategy  bfgs
+```
+
+STRU file:
+
+```
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00  0.00  0.00  mag  2.0  0.0  0.0  sc 1 1 0
+0.51  0.51  0.51  mag  0.0  0.0  2.0  sc 1 1 0
+```
+
 ## Noncollinear Spin Polarized Calculations
 The spin non-collinear polarization calculation corresponds to setting **"noncolin 1"**, in which case the coupling between spin up and spin down will be taken into account.
 In this case, nspin is automatically set to 4, which is usually not required to be specified manually.
diff --git a/docs/parameters.yaml b/docs/parameters.yaml
index 63ee83376c5..0b34a9543da 100644
--- a/docs/parameters.yaml
+++ b/docs/parameters.yaml
@@ -4266,6 +4266,26 @@ parameters:
     default_value: "1.0e-4"
     unit: ""
     availability: sc_mag_switch is true
+  - name: sc_direction_only
+    category: Spin-Constrained DFT
+    type: Boolean
+    description: |
+      When true, only the direction of the magnetic moment is constrained to the target direction, while the magnitude is allowed to vary freely. This is useful for studying magnetic anisotropy or when the magnitude of the moment is determined by the electronic structure rather than an external constraint. When false (default), both the direction and magnitude of the magnetic moment are constrained to the target values.
+    default_value: "False"
+    unit: ""
+    availability: sc_mag_switch is true
+  - name: sc_lambda_strategy
+    category: Spin-Constrained DFT
+    type: String
+    description: |
+      Lambda update strategy for spin-constrained DFT. Available options are:
+      * bfgs: BFGS quasi-Newton method (default, robust and well-tested)
+      * linear_response: linear response method (Scheme B)
+      * augmented_lagrangian: augmented Lagrangian method (Scheme C)
+      * hybrid_delayed: hybrid delayed update (Scheme D)
+    default_value: "bfgs"
+    unit: ""
+    availability: sc_mag_switch is true
   - name: qo_switch
     category: Quasiatomic Orbital (QO) analysis
     type: Boolean
diff --git a/interfaces/Wannier90_interface/examples_python/example_pw.py b/interfaces/Wannier90_interface/examples_python/example_pw.py
index ff70d2e86eb..93fc0ef62a0 100644
--- a/interfaces/Wannier90_interface/examples_python/example_pw.py
+++ b/interfaces/Wannier90_interface/examples_python/example_pw.py
@@ -64,7 +64,7 @@ def main():
     # 3. Dependency files (PW only needs pseudopotentials)
     # ----------------------------------------------------------
     job.pp_orbitals = {"Bi": "../../../tests/PP_ORB/Bi_pbe_fr.upf", "Se": "../../../tests/PP_ORB/Se_pbe_fr.upf"}
-    # ← PW基组不需要轨道文件，不设置 orbital_files
+    # PW basis does not require orbital files, orbital_files not set
 
     # ----------------------------------------------------------
     # 4. Wannier90 Parameters
@@ -99,7 +99,7 @@ def main():
         if DRY_RUN:
             job._validate_inputs()
 
-            # Step 0: SCF (PW基组)
+            # Step 0: SCF (PW basis)
             job.step0_run_scf(
                 scf_mp_grid=[4, 4, 4],
             )
diff --git a/source/source_base/kernels/cuda/math_kernel_op.cu b/source/source_base/kernels/cuda/math_kernel_op.cu
index c5b0648c49b..062ebe0e765 100644
--- a/source/source_base/kernels/cuda/math_kernel_op.cu
+++ b/source/source_base/kernels/cuda/math_kernel_op.cu
@@ -314,6 +314,9 @@ void gemm_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const ch
 {
     cublasOperation_t cutransA = judge_trans_op(true, transa, "gemm_op");
     cublasOperation_t cutransB = judge_trans_op(true, transb, "gemm_op");
+    if (cublas_handle == nullptr) {
+        CHECK_CUBLAS(cublasCreate(&cublas_handle));
+    }
     CHECK_CUBLAS(cublasZgemm(cublas_handle, cutransA, cutransB, m, n ,k, (double2*)alpha, (double2*)a , lda, (double2*)b, ldb, (double2*)beta, (double2*)c, ldc));
 }
 
diff --git a/source/source_base/main.cpp b/source/source_base/main.cpp
index 9a32f11d289..ec5db9d3266 100644
--- a/source/source_base/main.cpp
+++ b/source/source_base/main.cpp
@@ -36,7 +36,7 @@ void calculate()
 /*
 	time_t time_start = std::time(NULL);
 
-//	ModuleBase::timer::start();
+//	ModuleBase::timer::tick();
 
 	//----------------------------------------------------------
 	// main program for doing electronic structure calculations
diff --git a/source/source_base/module_container/base/macros/cuda.h b/source/source_base/module_container/base/macros/cuda.h
index 572eecdffd0..521861664a6 100644
--- a/source/source_base/module_container/base/macros/cuda.h
+++ b/source/source_base/module_container/base/macros/cuda.h
@@ -67,11 +67,13 @@ struct GetTypeCuda<double>
 {
     static constexpr cudaDataType cuda_data_type = cudaDataType::CUDA_R_64F;
 };
+#if CUDA_VERSION >= 11000
 template <>
 struct GetTypeCuda<int64_t>
 {
     static constexpr cudaDataType cuda_data_type = cudaDataType::CUDA_R_64I;
 };
+#endif
 template <>
 struct GetTypeCuda<std::complex<float>>
 {
diff --git a/source/source_base/module_container/base/third_party/cusolver.h b/source/source_base/module_container/base/third_party/cusolver.h
index 529109823df..466011438d6 100644
--- a/source/source_base/module_container/base/third_party/cusolver.h
+++ b/source/source_base/module_container/base/third_party/cusolver.h
@@ -19,6 +19,8 @@
 namespace container {
 namespace cuSolverConnector {
 
+#if CUDA_VERSION >= 11000
+// Generic API (CUDA 11.0+)
 template <typename T>
 static inline
 void trtri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, T* A, const int& lda)
@@ -37,7 +39,7 @@ void trtri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& d
     int h_info = 0;
     int* d_info = nullptr;
     CHECK_CUDA(cudaMalloc((void**)&d_info, sizeof(int)));
-    // Perform Cholesky decomposition
+    // Perform triangular matrix inversion
     CHECK_CUSOLVER(cusolverDnXtrtri(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, GetTypeCuda<T>::cuda_data_type, reinterpret_cast<Type*>(A), n, d_work, d_lwork, h_work, h_lwork, d_info));
     CHECK_CUDA(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost));
     if (h_info != 0) {
@@ -47,6 +49,57 @@ void trtri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& d
     CHECK_CUDA(cudaFree(d_work));
     CHECK_CUDA(cudaFree(d_info));
 }
+#else
+// Legacy API fallback (CUDA < 11.0)
+static inline void trtri(cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, float* A, const int& lda)
+{
+    int lwork = 0;
+    CHECK_CUSOLVER(cusolverDnStrtri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, A, lda, &lwork));
+    float* d_work = nullptr;
+    CHECK_CUDA(cudaMalloc((void**)&d_work, lwork * sizeof(float)));
+    int* d_info = nullptr;
+    CHECK_CUDA(cudaMalloc((void**)&d_info, sizeof(int)));
+    CHECK_CUSOLVER(cusolverDnStrtri(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, A, lda, d_work, lwork, d_info));
+    CHECK_CUDA(cudaFree(d_work));
+    CHECK_CUDA(cudaFree(d_info));
+}
+static inline void trtri(cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, double* A, const int& lda)
+{
+    int lwork = 0;
+    CHECK_CUSOLVER(cusolverDnDtrtri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, A, lda, &lwork));
+    double* d_work = nullptr;
+    CHECK_CUDA(cudaMalloc((void**)&d_work, lwork * sizeof(double)));
+    int* d_info = nullptr;
+    CHECK_CUDA(cudaMalloc((void**)&d_info, sizeof(int)));
+    CHECK_CUSOLVER(cusolverDnDtrtri(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, A, lda, d_work, lwork, d_info));
+    CHECK_CUDA(cudaFree(d_work));
+    CHECK_CUDA(cudaFree(d_info));
+}
+static inline void trtri(cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, std::complex<float>* A, const int& lda)
+{
+    int lwork = 0;
+    CHECK_CUSOLVER(cusolverDnCtrtri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, reinterpret_cast<cuComplex*>(A), lda, &lwork));
+    cuComplex* d_work = nullptr;
+    CHECK_CUDA(cudaMalloc((void**)&d_work, lwork * sizeof(cuComplex)));
+    int* d_info = nullptr;
+    CHECK_CUDA(cudaMalloc((void**)&d_info, sizeof(int)));
+    CHECK_CUSOLVER(cusolverDnCtrtri(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, reinterpret_cast<cuComplex*>(A), lda, d_work, lwork, d_info));
+    CHECK_CUDA(cudaFree(d_work));
+    CHECK_CUDA(cudaFree(d_info));
+}
+static inline void trtri(cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, std::complex<double>* A, const int& lda)
+{
+    int lwork = 0;
+    CHECK_CUSOLVER(cusolverDnZtrtri_bufferSize(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, reinterpret_cast<cuDoubleComplex*>(A), lda, &lwork));
+    cuDoubleComplex* d_work = nullptr;
+    CHECK_CUDA(cudaMalloc((void**)&d_work, lwork * sizeof(cuDoubleComplex)));
+    int* d_info = nullptr;
+    CHECK_CUDA(cudaMalloc((void**)&d_info, sizeof(int)));
+    CHECK_CUSOLVER(cusolverDnZtrtri(cusolver_handle, cublas_fill_mode(uplo), cublas_diag_type(diag), n, reinterpret_cast<cuDoubleComplex*>(A), lda, d_work, lwork, d_info));
+    CHECK_CUDA(cudaFree(d_work));
+    CHECK_CUDA(cudaFree(d_info));
+}
+#endif
 
 static inline
 void potri (cusolverDnHandle_t& cusolver_handle, const char& uplo, const char& diag, const int& n, float * A, const int& lda)
@@ -1327,7 +1380,7 @@ static inline void geqrf(
         cusolver_handle, m, n,
         reinterpret_cast<cuComplex*>(d_A),
         lda,
-        &lwork  // ← 这里才是 lwork 的地址！
+        &lwork  // ← correct: pass address of lwork
     ));
 
     cuComplex* d_work = nullptr;
@@ -1342,7 +1395,7 @@ static inline void geqrf(
         cusolver_handle, m, n,
         reinterpret_cast<cuComplex*>(d_A),
         lda,
-        reinterpret_cast<cuComplex*>(d_tau),  // ← 这里才是 d_tau
+        reinterpret_cast<cuComplex*>(d_tau),  // ← correct: d_tau
         d_work, lwork, d_info));
 
     int h_info = 0;
diff --git a/source/source_base/module_device/device_check.h b/source/source_base/module_device/device_check.h
index 92ab5b4d5db..529df35cadd 100644
--- a/source/source_base/module_device/device_check.h
+++ b/source/source_base/module_device/device_check.h
@@ -67,6 +67,7 @@ static const char* _cusolverGetErrorString(cusolverStatus_t error)
         return "CUSOLVER_STATUS_ZERO_PIVOT";
     case CUSOLVER_STATUS_INVALID_LICENSE:
         return "CUSOLVER_STATUS_INVALID_LICENSE";
+#if CUDA_VERSION >= 11000
     case CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED:
         return "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED";
     case CUSOLVER_STATUS_IRS_PARAMS_INVALID:
@@ -93,6 +94,7 @@ static const char* _cusolverGetErrorString(cusolverStatus_t error)
         return "CUSOLVER_STATUS_IRS_MATRIX_SINGULAR";
     case CUSOLVER_STATUS_INVALID_WORKSPACE:
         return "CUSOLVER_STATUS_INVALID_WORKSPACE";
+#endif
     default:
         return "<unknown>";
     }
diff --git a/source/source_base/parallel_global.cpp b/source/source_base/parallel_global.cpp
index 67eaff4235c..038b2420500 100644
--- a/source/source_base/parallel_global.cpp
+++ b/source/source_base/parallel_global.cpp
@@ -236,15 +236,30 @@ void Parallel_Global::read_pal_param(int argc,
 #ifdef __MPI
 void Parallel_Global::finalize_mpi()
 {
-    MPI_Comm_free(&POOL_WORLD);
-    if (KP_WORLD != MPI_COMM_NULL)
+    if (POOL_WORLD != MPI_COMM_NULL && POOL_WORLD != MPI_COMM_WORLD)
+    {
+        MPI_Comm_free(&POOL_WORLD);
+    }
+    if (KP_WORLD != MPI_COMM_NULL && KP_WORLD != MPI_COMM_WORLD)
     {
         MPI_Comm_free(&KP_WORLD);
     }
-    MPI_Comm_free(&INT_BGROUP);
-    MPI_Comm_free(&BP_WORLD);
-    MPI_Comm_free(&GRID_WORLD);
-    MPI_Comm_free(&DIAG_WORLD);
+    if (INT_BGROUP != MPI_COMM_NULL && INT_BGROUP != MPI_COMM_WORLD)
+    {
+        MPI_Comm_free(&INT_BGROUP);
+    }
+    if (BP_WORLD != MPI_COMM_NULL && BP_WORLD != MPI_COMM_WORLD)
+    {
+        MPI_Comm_free(&BP_WORLD);
+    }
+    if (GRID_WORLD != MPI_COMM_NULL && GRID_WORLD != MPI_COMM_WORLD)
+    {
+        MPI_Comm_free(&GRID_WORLD);
+    }
+    if (DIAG_WORLD != MPI_COMM_NULL && DIAG_WORLD != MPI_COMM_WORLD)
+    {
+        MPI_Comm_free(&DIAG_WORLD);
+    }
     MPI_Finalize();
 }
 #endif
diff --git a/source/source_basis/module_pw/pw_distributeg.cpp b/source/source_basis/module_pw/pw_distributeg.cpp
index ea026e88d41..317d6ad863b 100644
--- a/source/source_basis/module_pw/pw_distributeg.cpp
+++ b/source/source_basis/module_pw/pw_distributeg.cpp
@@ -25,9 +25,8 @@ void PW_Basis::distribute_g()
     {
         ModuleBase::WARNING_QUIT("divide", "No such division type.");
     }
-    const char* no_pw_message = "Current core has no plane waves! Please reduce the cores.";
     ModuleBase::CHECK_WARNING_QUIT((this->npw == 0), "pw_distributeg.cpp", PARAM.inp.calculation,
-                                   no_pw_message);
+    "Current core has no plane waves! Please reduce the cores.");
     ModuleBase::timer::end(this->classname, "distributeg");
     return;
 }
diff --git a/source/source_cell/read_atoms_helper.cpp b/source/source_cell/read_atoms_helper.cpp
index 4fc3dfe6cb0..38671fe5316 100644
--- a/source/source_cell/read_atoms_helper.cpp
+++ b/source/source_cell/read_atoms_helper.cpp
@@ -453,6 +453,9 @@ bool parse_atom_properties(std::ifstream& ifpos,
             atom.lambda[ia].x /= ModuleBase::Ry_to_eV;
             atom.lambda[ia].y /= ModuleBase::Ry_to_eV;
             atom.lambda[ia].z /= ModuleBase::Ry_to_eV;
+            std::cout << "[DS-DIAG] STRU parse: lambda[" << ia << "]=("
+                      << atom.lambda[ia].x << ", " << atom.lambda[ia].y << ", "
+                      << atom.lambda[ia].z << ") Ry/uB (converted from eV/uB)" << std::endl;
         }
         else if ( tmpid == "sc")
         {
diff --git a/source/source_cell/test/read_sep_test.cpp b/source/source_cell/test/read_sep_test.cpp
index 0bfada1a36d..484418c3e4b 100644
--- a/source/source_cell/test/read_sep_test.cpp
+++ b/source/source_cell/test/read_sep_test.cpp
@@ -69,7 +69,7 @@ TEST_F(ReadSepTest, PrintSep)
     if (GlobalV::MY_RANK == 0)
     {
 #endif
-        // 设置测试数据
+        // Set up test data
         read_sep->label = "F";
         read_sep->xc_type = "pbe";
         read_sep->orbital = "p";
@@ -78,13 +78,13 @@ TEST_F(ReadSepTest, PrintSep)
         read_sep->r = new double[2]{0.1, 0.2};
         read_sep->rv = new double[2]{1.0, 2.0};
 
-        // 测试打印功能
+        // Test print functionality
         std::ofstream ofs("test_sep.out");
         read_sep->print_sep_info(ofs);
         read_sep->print_sep_vsep(ofs);
         ofs.close();
 
-        // 验证输出文件
+        // Verify output file
         std::ifstream ifs("test_sep.out");
         std::string line;
         std::vector<std::string> lines;
diff --git a/source/source_esolver/esolver_ks_lcao.cpp b/source/source_esolver/esolver_ks_lcao.cpp
index 9d3906d6ebf..66831ae85a7 100644
--- a/source/source_esolver/esolver_ks_lcao.cpp
+++ b/source/source_esolver/esolver_ks_lcao.cpp
@@ -398,7 +398,27 @@ void ESolver_KS_LCAO<TK, TR>::hamilt2rho_single(UnitCell& ucell, int istep, int
     bool skip_charge = PARAM.inp.calculation == "nscf" ? true : false;
 
     // 2) run the inner lambda loop to contrain atomic moments with the DeltaSpin method
-    bool skip_solve = run_deltaspin_lambda_loop_lcao<TK>(iter - 1, this->drho, PARAM.inp);
+    bool skip_solve = false;
+    if (PARAM.inp.sc_mag_switch)
+    {
+        spinconstrain::SpinConstrain<TK>& sc = spinconstrain::SpinConstrain<TK>::getScInstance();
+        if (PARAM.inp.sc_lambda_strategy == "linear_scan")
+        {
+            sc.run_lambda_linear_scan(iter - 1);
+            skip_solve = true;
+        }
+        else if (!sc.mag_converged() && this->drho > 0 && this->drho < PARAM.inp.sc_scf_thr)
+        {
+            sc.run_lambda_loop(iter - 1);
+            sc.set_mag_converged(true);
+            skip_solve = true;
+        }
+        else if (sc.mag_converged())
+        {
+            sc.run_lambda_loop(iter - 1);
+            skip_solve = true;
+        }
+    }
 
     // 3) run Hsolver
     if (!skip_solve)
@@ -407,6 +427,12 @@ void ESolver_KS_LCAO<TK, TR>::hamilt2rho_single(UnitCell& ucell, int istep, int
         hsolver_lcao_obj.solve(static_cast<hamilt::Hamilt<TK>*>(this->p_hamilt), this->psi[0], this->pelec, *this->dmat.dm, 
           this->chr, PARAM.inp.nspin, skip_charge);
     }
+    else
+    {
+        // Lambda loop updated the density matrix (DM) but not the real-space charge density.
+        // HSolver was skipped, so we need to sync rho from DM manually.
+        LCAO_domain::dm2rho(this->dmat.dm->get_DMR_vector(), PARAM.inp.nspin, &this->chr);
+    }
 
     // 4) EXX
 #ifdef __EXX
diff --git a/source/source_esolver/esolver_ks_pw.cpp b/source/source_esolver/esolver_ks_pw.cpp
index 6714821d02f..9932155ac63 100644
--- a/source/source_esolver/esolver_ks_pw.cpp
+++ b/source/source_esolver/esolver_ks_pw.cpp
@@ -189,7 +189,7 @@ void ESolver_KS_PW<T, Device>::iter_init(UnitCell& ucell, const int istep, const
 
     // update local occupations for DFT+U
     // should before lambda loop in DeltaSpin
-    pw::iter_init_dftu_pw(iter, istep, this->dftu, this->stp.template get_psi_t<T, Device>(), this->pelec->wg, ucell, PARAM.inp);
+    pw::iter_init_dftu_pw(iter, istep, this->dftu, this->stp.template get_psi_t<T, Device>(), this->pelec->wg, ucell, this->p_chgmix, this->kv.isk.data());
 }
 
 // Temporary, it should be replaced by hsolver later.
diff --git a/source/source_esolver/esolver_sdft_pw.cpp b/source/source_esolver/esolver_sdft_pw.cpp
index 02300eb3c58..fbe2c1b24ad 100644
--- a/source/source_esolver/esolver_sdft_pw.cpp
+++ b/source/source_esolver/esolver_sdft_pw.cpp
@@ -157,8 +157,8 @@ void ESolver_SDFT_PW<T, Device>::hamilt2rho_single(UnitCell& ucell, int istep, i
                                                            this->p_hamilt_sto,
                                                            PARAM.inp.calculation,
                                                            PARAM.inp.basis_type,
-                                                           PARAM.inp.ks_solver,
-                                                           PARAM.globalv.use_uspp,
+                                                            PARAM.inp.ks_solver,
+                                                            PARAM.globalv.use_uspp,
                                                            PARAM.inp.nspin,
                                                            hsolver::DiagoIterAssist<T, Device>::SCF_ITER,
                                                            hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
diff --git a/source/source_esolver/lcao_others.cpp b/source/source_esolver/lcao_others.cpp
index b3ad0c71499..62aadebe130 100644
--- a/source/source_esolver/lcao_others.cpp
+++ b/source/source_esolver/lcao_others.cpp
@@ -156,6 +156,7 @@ void ESolver_KS_LCAO<TK, TR>::others(UnitCell& ucell, const int istep)
                    PARAM.inp.sccut,
                    PARAM.inp.sc_drop_thr,
                    ucell,
+                   PARAM.inp.sc_direction_only,
                    &(this->pv),
                    PARAM.inp.nspin,
                    this->kv,
diff --git a/source/source_estate/elecstate_lcao.h b/source/source_estate/elecstate_lcao.h
index bf1f11e1f7e..1e7cafbfa62 100644
--- a/source/source_estate/elecstate_lcao.h
+++ b/source/source_estate/elecstate_lcao.h
@@ -3,6 +3,8 @@
 
 #include "elecstate.h"
 #include "source_estate/module_dm/density_matrix.h"
+#include "source_basis/module_ao/parallel_orbitals.h"
+#include "source_cell/klist.h"
 
 #include <vector>
 
@@ -26,11 +28,21 @@ class ElecStateLCAO : public ElecState
 
     virtual ~ElecStateLCAO()
     {
+        if (this->DM != nullptr)
+        {
+            delete this->DM;
+        }
     }
 
     // update charge density for next scf step
     // void getNewRho() override;
 
+    // initial density matrix
+    void init_DM(const K_Vectors* kv, const Parallel_Orbitals* paraV, const int nspin);
+    DensityMatrix<TK, double>* get_DM() const
+    {
+        return const_cast<DensityMatrix<TK, double>*>(this->DM);
+    }
     static int out_wfc_lcao;
     static bool need_psi_grid;
 
@@ -48,6 +60,9 @@ class ElecStateLCAO : public ElecState
 			std::vector<TK*> pexsi_EDM, 
 			DensityMatrix<TK, double>* dm);
 
+  private:
+    DensityMatrix<TK, double>* DM = nullptr;
+
 };
 
 template <typename TK>
@@ -56,6 +71,17 @@ int ElecStateLCAO<TK>::out_wfc_lcao = 0;
 template <typename TK>
 bool ElecStateLCAO<TK>::need_psi_grid = true;
 
+// init_DM implementation
+template <typename TK>
+void ElecStateLCAO<TK>::init_DM(const K_Vectors* kv, const Parallel_Orbitals* paraV, const int nspin)
+{
+    if (this->DM != nullptr)
+    {
+        delete this->DM;
+    }
+    this->DM = new DensityMatrix<TK, double>(paraV, nspin);
+}
+
 } // namespace elecstate
 
 #endif
diff --git a/source/source_estate/elecstate_pw.cpp b/source/source_estate/elecstate_pw.cpp
index de05d441b58..0c7bdbd3817 100644
--- a/source/source_estate/elecstate_pw.cpp
+++ b/source/source_estate/elecstate_pw.cpp
@@ -307,7 +307,7 @@ void ElecStatePW<T, Device>::cal_becsum(const psi::Psi<T, Device>& psi)
                       this->ppcell->nkb,
                       &one,
                       this->vkb,
-                      this->ppcell->vkb.nc,
+                      this->ppcell->vkbnc,
                       psi_now,
                       inc,
                       &zero,
@@ -323,7 +323,7 @@ void ElecStatePW<T, Device>::cal_becsum(const psi::Psi<T, Device>& psi)
                       npw,
                       &one,
                       this->vkb,
-                      this->ppcell->vkb.nc,
+                      this->ppcell->vkbnc,
                       psi_now,
                       npwx,
                       &zero,
diff --git a/source/source_estate/module_charge/charge_mixing.cpp b/source/source_estate/module_charge/charge_mixing.cpp
index 921d102502c..a91cc1b39fa 100644
--- a/source/source_estate/module_charge/charge_mixing.cpp
+++ b/source/source_estate/module_charge/charge_mixing.cpp
@@ -257,3 +257,34 @@ bool Charge_Mixing::if_scf_oscillate(const int iteration, const double drho, con
 
     return false;
 }
+
+void Charge_Mixing::allocate_mixing_uom(int uom_size)
+{
+    ModuleBase::TITLE("Charge_Mixing", "allocate_mixing_uom");
+    ModuleBase::timer::start("Charge_Mixing", "allocate_mixing_uom");
+    ModuleBase::timer::end("Charge_Mixing", "allocate_mixing_uom");
+    // For nspin=2, uom_size already includes both spin channels
+    // (eff_pot_pw.size() = pot_index * 2 for nspin=2)
+    // So uom_fold should always be 1
+    this->mixing->init_mixing_data(this->uom_mdata, uom_size, sizeof(double));
+    this->uom_mdata.reset();
+    ModuleBase::timer::start("Charge_Mixing", "allocate_mixing_uom");
+    ModuleBase::timer::end("Charge_Mixing", "allocate_mixing_uom");
+    return;
+}
+
+void Charge_Mixing::mix_uom(std::vector<double>& uom_in, std::vector<double>& uom_save_in)
+{
+    ModuleBase::TITLE("Charge_Mixing", "mix_uom");
+    ModuleBase::timer::start("Charge_Mixing", "mix_uom");
+    ModuleBase::timer::end("Charge_Mixing", "mix_uom");
+    double* uom_value_out = uom_in.data();
+    double* uom_value_in = uom_save_in.data();
+    // For all nspin cases, uom_array layout is already fully sized
+    // and mixing operates on the entire array
+    this->mixing->push_data(this->uom_mdata, uom_value_in, uom_value_out, nullptr, false);
+    this->mixing->mix_data(this->uom_mdata, uom_value_out);
+    ModuleBase::timer::start("Charge_Mixing", "mix_uom");
+    ModuleBase::timer::end("Charge_Mixing", "mix_uom");
+    return;
+}
diff --git a/source/source_estate/module_charge/charge_mixing.h b/source/source_estate/module_charge/charge_mixing.h
index 3152dc5e204..c24a866df91 100644
--- a/source/source_estate/module_charge/charge_mixing.h
+++ b/source/source_estate/module_charge/charge_mixing.h
@@ -50,6 +50,7 @@ class Charge_Mixing
                     double& tpiba_in);
 
     void close_kerker_gg0() { mixing_gg0 = 0.0; mixing_gg0_mag = 0.0; }
+    void conserve_setting() { mixing_beta = 0.01; mixing_beta_mag = 0.04; }
     /**
      * @brief initialize mixing, including constructing mixing and allocating memory for mixing data
      * @brief this function should be called at eachiterinit()
@@ -74,7 +75,20 @@ class Charge_Mixing
      */
     void mix_dmr(elecstate::DensityMatrix<double, double>* DM);
     void mix_dmr(elecstate::DensityMatrix<std::complex<double>, double>* DM);
-    
+
+    /**
+     * @brief allocate memory of uom_mdata
+     * @param uom_size size of DFT+U occupation matrix
+     */
+    void allocate_mixing_uom(int size_uom);
+
+    /**
+     * @brief DFT+U occupation matrix mixing
+     * @param uom_in output occupation matrix
+     * @param uom_save_in input occupation matrix
+     */
+    void mix_uom(std::vector<double>& uom_in, std::vector<double>& uom_save_in);
+
     /**
      * @brief Get the drho between rho and rho_save, similar for get_dkin
      *
@@ -118,6 +132,7 @@ class Charge_Mixing
     Base_Mixing::Mixing_Data tau_mdata;    ///< Mixing data for kinetic energy density
     Base_Mixing::Mixing_Data nhat_mdata;   ///< Mixing data for compensation density
     Base_Mixing::Mixing_Data dmr_mdata;    ///< Mixing data for real space density matrix
+    Base_Mixing::Mixing_Data uom_mdata;    ///< Mixing data for DFT+U occupation matrix
     Base_Mixing::Plain_Mixing* mixing_highf = nullptr; ///< The high_frequency part is mixed by plain mixing method.
 
     //======================================
diff --git a/source/source_estate/module_charge/chgmixing.cpp b/source/source_estate/module_charge/chgmixing.cpp
index 45e5c5b350c..1fd48fac5d3 100644
--- a/source/source_estate/module_charge/chgmixing.cpp
+++ b/source/source_estate/module_charge/chgmixing.cpp
@@ -128,6 +128,13 @@ void module_charge::chgmixing_ks_pw(const int iter, // scf iteration number
     {
         p_chgmix->init_mixing();
         p_chgmix->mixing_restart_step = inp.scf_nmax + 1;
+        if (inp.dft_plus_u && inp.mixing_dftu)
+        {
+            // enable mixing_dftu for DFT+U occupation mixing
+            dftu.enable_mixing();
+            // allocate memory for uom_mdata
+            p_chgmix->allocate_mixing_uom(dftu.get_size_eff_pot_pw());
+        }
     }
 
     // For mixing restart
@@ -158,9 +165,9 @@ void module_charge::chgmixing_ks_pw(const int iter, // scf iteration number
 				{
 					dftu.uramping_update(); // update U by uramping if uramping > 0.01
 					std::cout << " U-Ramping! Current U = ";
-					for (int i = 0; i < dftu.U0.size(); i++)
+					for (int i = 0; i < dftu.get_num_u_types(); i++)
 					{
-						std::cout << dftu.U[i] * ModuleBase::Ry_to_eV << " ";
+						std::cout << dftu.get_hubbard_u(i) * ModuleBase::Ry_to_eV << " ";
 					}
 					std::cout << " eV " << std::endl;
 				}
@@ -184,13 +191,18 @@ void module_charge::chgmixing_ks_lcao(const int iter, // scf iteration number
         p_chgmix->mix_reset(); // init mixing
         p_chgmix->mixing_restart_step = inp.scf_nmax + 1;
         p_chgmix->mixing_restart_count = 0;
+        // enable mixing_dftu for DFT+U occupation mixing
+        if (inp.dft_plus_u && inp.mixing_dftu)
+        {
+            dftu.enable_mixing();
+        }
         // this output will be removed once the feeature is stable
         if (dftu.uramping > 0.01)
         {
             std::cout << " U-Ramping! Current U = ";
-            for (int i = 0; i < dftu.U0.size(); i++)
+            for (int i = 0; i < dftu.get_num_u_types(); i++)
             {
-                std::cout << dftu.U[i] * ModuleBase::Ry_to_eV << " ";
+                std::cout << dftu.get_hubbard_u(i) * ModuleBase::Ry_to_eV << " ";
             }
             std::cout << " eV " << std::endl;
         }
@@ -207,9 +219,9 @@ void module_charge::chgmixing_ks_lcao(const int iter, // scf iteration number
             if (dftu.uramping > 0.01)
             {
                 std::cout << " U-Ramping! Current U = ";
-                for (int i = 0; i < dftu.U0.size(); i++)
+                for (int i = 0; i < dftu.get_num_u_types(); i++)
                 {
-                    std::cout << dftu.U[i] * ModuleBase::Ry_to_eV << " ";
+                    std::cout << dftu.get_hubbard_u(i) * ModuleBase::Ry_to_eV << " ";
                 }
                 std::cout << " eV " << std::endl;
             }
diff --git a/source/source_hsolver/hsolver_lcao.cpp b/source/source_hsolver/hsolver_lcao.cpp
index b1c7ba9c95e..5b65a523343 100644
--- a/source/source_hsolver/hsolver_lcao.cpp
+++ b/source/source_hsolver/hsolver_lcao.cpp
@@ -161,7 +161,6 @@ void HSolverLCAO<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T>* hm, psi::Psi<T>&
 #ifdef __CUDA
     else if (this->method == "cusolver")
     {
-        // Note: This branch will only be executed in the single-process case
         DiagoCusolver<T> cu;
         hamilt::MatrixBlock<T> hk, sk;
         hm->matrix(hk, sk);
diff --git a/source/source_hsolver/kernels/cuda/diag_cusolvermp.cu b/source/source_hsolver/kernels/cuda/diag_cusolvermp.cu
index c53139897f0..345d415ac05 100644
--- a/source/source_hsolver/kernels/cuda/diag_cusolvermp.cu
+++ b/source/source_hsolver/kernels/cuda/diag_cusolvermp.cu
@@ -58,18 +58,18 @@ Diag_CusolverMP_gvd<inputT>::Diag_CusolverMP_gvd(const MPI_Comm mpi_comm,
                                                  const int nacols,
                                                  const int* desc)
 {
-    // 构造函数的实现
+    // Constructor implementation
     this->cblacs_ctxt = desc[1];
     this->nFull = desc[2];
 
     // 20240529 zhanghaochong
-    // set mb and nb is not nessary, but I keep it here for the code consistency.
-    // Because in ABACUS, mb always equals to nb
+    // Setting mb and nb is not necessary, but kept for code consistency.
+    // In ABACUS, mb always equals nb.
     const int mb = desc[4];
     const int nb = desc[5];
 
     // 20240529 zhanghaochong
-    // so far, cusolverMpSygvd only support rsrc == 0 and csrc == 0
+    // So far, cusolverMpSygvd only supports rsrc == 0 and csrc == 0
     const int rsrc = desc[6];
     const int csrc = desc[7];
 
@@ -106,7 +106,7 @@ Diag_CusolverMP_gvd<inputT>::Diag_CusolverMP_gvd(const MPI_Comm mpi_comm,
     CHECK_CUSOLVER(cusolverMpCreate(&cusolverMpHandle, local_device_id, this->localStream));
 
     // 20240529 zhanghaochong
-    // so far, cusolvermp only support = 1
+    // So far, cusolverMp only supports matrix_i = 1
     this->matrix_i = 1;
     this->matrix_j = 1;
     this->m_local = narows;
@@ -125,11 +125,11 @@ Diag_CusolverMP_gvd<inputT>::Diag_CusolverMP_gvd(const MPI_Comm mpi_comm,
     }
 
     // 20240529 zhanghaochong
-    // the cpu mpi process blacs grid and multi gpu process blacs grid is the SAME
-    // Setting them the same is not a natural result, but a result that I forced and artificially specified.
-    // This is because the current implementation of the cusolvermp library is ONE process ONE GPU.
-    // So, when we use cusolvermp, we must ensure that the number of processes is equal to the number of GPUs.
-    // In a sense, the MPI usage strategy of ABACUS must be subject to the cusolvermp.
+    // The CPU MPI process BLACS grid and multi-GPU process BLACS grid are the SAME.
+    // Setting them the same is not a natural result, but artificially enforced.
+    // This is because the current cusolverMp library implementation uses one process per GPU.
+    // Therefore, when using cusolverMp, the number of processes must equal the number of GPUs.
+    // The MPI usage strategy in ABACUS must conform to cusolverMp requirements.
     // Use ROW_MAJOR to match BLACS grid initialization (order='R' in parallel_2d.cpp)
     CHECK_CUSOLVER(cusolverMpCreateDeviceGrid(cusolverMpHandle,
                                                    &this->grid,
@@ -143,9 +143,9 @@ Diag_CusolverMP_gvd<inputT>::Diag_CusolverMP_gvd(const MPI_Comm mpi_comm,
                                                    CUSOLVERMP_GRID_MAPPING_ROW_MAJOR));
 
     // 20240529 zhanghaochong
-    // Actually, there should be three matrix descriptors, A matrix, B matrix, and output eigenvector matrix.
-    // But in ABACUS the three matrices descriptors are the same.
-    // So, I only create one matrix descriptor and use it for the three matrices.
+    // There should be three matrix descriptors: A matrix, B matrix, and output eigenvector matrix.
+    // However, in ABACUS all three matrix descriptors are identical.
+    // Therefore, only one matrix descriptor is created and reused for all three matrices.
     CHECK_CUSOLVER(cusolverMpCreateMatrixDesc(&this->desc_for_cusolvermp,
                                this->grid,
                                this->datatype,
@@ -282,12 +282,11 @@ int Diag_CusolverMP_gvd<inputT>::generalized_eigenvector(inputT* A, inputT* B, o
                                this->n_local * this->m_local * sizeof(inputT),
                                cudaMemcpyDeviceToHost));
     // 20240529 zhanghaochong
-    // I move the free operations from destructor to here.
-    // Because I think it is more reasonable to free the memory in the function where it is allocated.
-    // Destructor is used to release resources that allocated in the constructor.
-    // And currently, we construct and destruct the object in every SCF iteration. Maybe one day we
-    // will construct the object only once during the whole program life cycle.
-    // In that case, allocate and free memory in compute function is more reasonable.
+    // Memory deallocation moved from destructor to here for better resource management.
+    // The destructor should release resources allocated in the constructor.
+    // Currently, the object is constructed and destructed every SCF iteration.
+    // In the future, the object may be constructed once during the entire program lifecycle.
+    // Allocating and freeing memory in the compute function is more appropriate in that case.
     CHECK_CUDA(cudaFree(d_A));
     CHECK_CUDA(cudaFree(d_B));
     CHECK_CUDA(cudaFree(d_D));
diff --git a/source/source_io/module_output/print_info.cpp b/source/source_io/module_output/print_info.cpp
index 398cbb49a8f..b76e7631fa9 100644
--- a/source/source_io/module_output/print_info.cpp
+++ b/source/source_io/module_output/print_info.cpp
@@ -85,7 +85,7 @@ void print_parameters(
 
         const bool orbinfo = (inp.basis_type=="lcao" || inp.basis_type=="lcao_in_pw" 
               || (inp.basis_type=="pw" && inp.init_wfc.substr(0, 3) == "nao"));
-
+        if (orbinfo) { std::cout << std::setw(12) << "NBASE"; }
 
         std::cout << std::endl;
         std::cout << " " << std::setw(8) << inp.nspin;
@@ -103,8 +103,13 @@ void print_parameters(
              << std::setw(14) << PARAM.globalv.nthread_per_proc
              << std::setw(14) << PARAM.globalv.nthread_per_proc*GlobalV::NPROC;
 
+        if (orbinfo) { std::cout << std::setw(12) << PARAM.globalv.nlocal; }
+
         std::cout << std::endl;
 
+
+
+
         std::cout << " ----------------------------------------------------------------" << std::endl;
         if(inp.basis_type == "lcao")
         {
@@ -120,13 +125,11 @@ void print_parameters(
         }
         std::cout << " ----------------------------------------------------------------" << std::endl;
 
+
+
         //----------------------------------
         // second part
         //----------------------------------
-        if (orbinfo) 
-        { 
-            std::cout << " TOTAL NBASE" << " " << PARAM.globalv.nlocal << std::endl;
-        }
 
         std::cout << " " << std::setw(8) << "ELEMENT";
 
@@ -137,6 +140,7 @@ void print_parameters(
         }
         std::cout << std::setw(12) << "NATOM";
 
+        std::cout << std::setw(12) << "XC";
         std::cout << std::endl;
 
 
diff --git a/source/source_io/module_parameter/input_parameter.h b/source/source_io/module_parameter/input_parameter.h
index 029ad364eb5..f34a9521793 100644
--- a/source/source_io/module_parameter/input_parameter.h
+++ b/source/source_io/module_parameter/input_parameter.h
@@ -597,11 +597,16 @@ struct Input_para
     double sc_thr = 1e-06;          ///< threshold for spin-constrained DFT in uB
     int nsc = 100;                  ///< maximum number of inner lambda loop
     int nsc_min = 2;                ///< minimum number of inner lambda loop
-    int sc_scf_nmin = 2;            ///< minimum number of outer scf loop before initial lambda loop
     double alpha_trial = 0.01;      ///< initial trial step size for lambda in eV/uB^2
     double sccut = 3.0;             ///< restriction of step size in eV/uB
     double sc_scf_thr = 1e-3;       ///< minimum number of outer scf loop before initial lambda loop
     double sc_drop_thr = 1e-3;      ///< threshold for lambda-loop threshold cutoff in spin-constrained DFT
+    std::string sc_lambda_strategy = "bfgs";  ///< lambda update strategy: bfgs, bfgs2, linear_response, augmented_lagrangian, hybrid_delayed, linear_scan
+    bool sc_direction_only = false; ///< only optimize the direction of magnetization
+    // linear_scan parameters
+    double sc_scan_lambda_start = 0.0;  ///< start value for lambda scan (eV/uB)
+    double sc_scan_lambda_end = 1.0;    ///< end value for lambda scan (eV/uB)
+    int sc_scan_steps = 20;             ///< number of steps in lambda scan
 
     // ==============   #Parameters (18.Quasiatomic Orbital analysis) =========
     ///<==========================================================
diff --git a/source/source_io/module_parameter/read_input_item_elec_stru.cpp b/source/source_io/module_parameter/read_input_item_elec_stru.cpp
index 39f37febc54..0fe7ad35aa8 100644
--- a/source/source_io/module_parameter/read_input_item_elec_stru.cpp
+++ b/source/source_io/module_parameter/read_input_item_elec_stru.cpp
@@ -831,7 +831,7 @@ Note: If gamma_only is set to 1, the KPT file will be overwritten. So make sure
         item.annotation = "charge density error";
         item.category = "Electronic structure";
         item.type = "Real";
-        item.description = "It's the density threshold for electronic iteration. It represents the charge density error between two sequential densities from electronic iterations. This criterion is always enabled. If scf_ene_thr is set, the total-energy criterion (scf_ene_thr) is additionally checked only after the first SCF iteration and only when the charge-density criterion (scf_thr) has already been satisfied. For local-orbital calculations, 1e-6 is usually accurate enough.";
+        item.description = "It's the density threshold for electronic iteration. It represents the charge density error between two sequential densities from electronic iterations. Usually for local orbitals, usually 1e-6 may be accurate enough.";
         item.default_value = "1.0e-9 (plane-wave basis), or 1.0e-7 (localized atomic orbital basis).";
         item.unit = "Ry if scf_thr_type=1, dimensionless if scf_thr_type=2";
         item.availability = "";
@@ -865,7 +865,7 @@ Note: If gamma_only is set to 1, the KPT file will be overwritten. So make sure
         item.annotation = "total energy error threshold";
         item.category = "Electronic structure";
         item.type = "Real";
-        item.description = "It's the energy threshold for electronic iteration. The compared quantity is the total-energy difference evaluated from the charge densities before and after the Hpsi operation in one SCF step. It is not the same as the screen-output EDIFF, which is the energy difference before Hpsi and after charge mixing (i.e., across both Hpsi and charge-mixing operations).";
+        item.description = "It's the energy threshold for electronic iteration. It represents the total energy error between two sequential densities from electronic iterations.";
         item.default_value = "-1.0. If the user does not set this parameter, it will not take effect.";
         item.unit = "eV";
         item.availability = "";
diff --git a/source/source_io/module_parameter/read_input_item_exx_dftu.cpp b/source/source_io/module_parameter/read_input_item_exx_dftu.cpp
index 8daa6224b8c..4afec198309 100644
--- a/source/source_io/module_parameter/read_input_item_exx_dftu.cpp
+++ b/source/source_io/module_parameter/read_input_item_exx_dftu.cpp
@@ -643,9 +643,9 @@ void ReadInput::item_dftu()
             const Input_para& input = para.input;
             if (input.dft_plus_u != 0)
             {
-                if (input.basis_type == "pw" && input.nspin != 4)
+                if (input.basis_type == "pw" && input.nspin != 4 && input.nspin != 2 && input.nspin != 1)
                 {
-                    ModuleBase::WARNING_QUIT("ReadInput", "WRONG ARGUMENTS, only nspin2 with PW base is not supported now");
+                    ModuleBase::WARNING_QUIT("ReadInput", "WRONG ARGUMENTS, DFT+U with PW base only supports nspin=1/2/4");
                 }
             }
         };
diff --git a/source/source_io/module_parameter/read_input_item_other.cpp b/source/source_io/module_parameter/read_input_item_other.cpp
index d929b0ee7f5..387d7d464d7 100644
--- a/source/source_io/module_parameter/read_input_item_other.cpp
+++ b/source/source_io/module_parameter/read_input_item_other.cpp
@@ -117,25 +117,6 @@ void ReadInput::item_others()
         };
         this->add_item(item);
     }
-    {
-        Input_Item item("sc_scf_nmin");
-        item.annotation = "Minimum number of outer scf loop before "
-                          "initializing lambda loop";
-        item.category = "Spin-Constrained DFT";
-        item.type = "Integer";
-        item.description = "Minimum number of outer scf loop before initializing lambda loop";
-        item.default_value = "2";
-        item.unit = "";
-        item.availability = "sc_mag_switch is true";
-        read_sync_int(input.sc_scf_nmin);
-        item.check_value = [](const Input_Item& item, const Parameter& para) {
-            if (para.input.sc_scf_nmin < 2)
-            {
-                ModuleBase::WARNING_QUIT("ReadInput", "sc_scf_nmin must >= 2");
-            }
-        };
-        this->add_item(item);
-    }
     {
         Input_Item item("alpha_trial");
         item.annotation = "Initial trial step size for lambda in eV/uB^2";
@@ -202,6 +183,80 @@ void ReadInput::item_others()
         };
         this->add_item(item);
     }
+    {
+        Input_Item item("sc_direction_only");
+        item.annotation = "only optimize the direction of magnetization";
+        item.category = "Spin-Constrained DFT";
+        item.type = "Boolean";
+        item.description = R"(When true, only the direction of the magnetic moment is constrained to the target direction, while the magnitude is allowed to vary freely. This is useful for studying magnetic anisotropy or when the magnitude of the moment is determined by the electronic structure rather than an external constraint.
+
+When false (default), both the direction and magnitude of the magnetic moment are constrained to the target values.)";
+        item.default_value = "False";
+        item.unit = "";
+        item.availability = "sc_mag_switch is true";
+        read_sync_bool(input.sc_direction_only);
+        this->add_item(item);
+    }
+    {
+        Input_Item item("sc_lambda_strategy");
+        item.annotation = "lambda update strategy for spin-constrained DFT";
+        item.category = "Spin-Constrained DFT";
+        item.type = "String";
+        item.description = R"(Lambda update strategy for spin-constrained DFT:
+* bfgs: BFGS quasi-Newton method
+* linear_response: linear response (Scheme B)
+* augmented_lagrangian: augmented Lagrangian (Scheme C)
+* hybrid_delayed: hybrid delayed update (Scheme D)
+* linear_scan: linear sweep of lambda for testing magnetic moment response)";
+        item.default_value = "bfgs";
+        item.unit = "";
+        item.availability = "sc_mag_switch is true";
+        read_sync_string(input.sc_lambda_strategy);
+        item.check_value = [](const Input_Item& item, const Parameter& para) {
+            const std::vector<std::string> valid = {"bfgs", "bfgs2", "linear_response", "augmented_lagrangian", "hybrid_delayed", "linear_scan"};
+            if (std::find(valid.begin(), valid.end(), para.input.sc_lambda_strategy) == valid.end())
+            {
+                ModuleBase::WARNING_QUIT("ReadInput", "sc_lambda_strategy must be bfgs, bfgs2, linear_response, augmented_lagrangian, hybrid_delayed, or linear_scan");
+            }
+        };
+        this->add_item(item);
+    }
+    {
+        Input_Item item("sc_scan_lambda_start");
+        item.annotation = "start value for linear lambda scan (eV/uB)";
+        item.category = "Spin-Constrained DFT";
+        item.type = "Float";
+        item.description = "Starting lambda value for linear_scan strategy. Only used when sc_lambda_strategy=linear_scan.";
+        item.default_value = "0.0";
+        item.unit = "eV/uB";
+        item.availability = "sc_lambda_strategy is linear_scan";
+        read_sync_double(input.sc_scan_lambda_start);
+        this->add_item(item);
+    }
+    {
+        Input_Item item("sc_scan_lambda_end");
+        item.annotation = "end value for linear lambda scan (eV/uB)";
+        item.category = "Spin-Constrained DFT";
+        item.type = "Float";
+        item.description = "Ending lambda value for linear_scan strategy. Only used when sc_lambda_strategy=linear_scan.";
+        item.default_value = "1.0";
+        item.unit = "eV/uB";
+        item.availability = "sc_lambda_strategy is linear_scan";
+        read_sync_double(input.sc_scan_lambda_end);
+        this->add_item(item);
+    }
+    {
+        Input_Item item("sc_scan_steps");
+        item.annotation = "number of steps for linear lambda scan";
+        item.category = "Spin-Constrained DFT";
+        item.type = "Integer";
+        item.description = "Number of lambda values to scan. Only used when sc_lambda_strategy=linear_scan.";
+        item.default_value = "20";
+        item.unit = "";
+        item.availability = "sc_lambda_strategy is linear_scan";
+        read_sync_int(input.sc_scan_steps);
+        this->add_item(item);
+    }
 
     // Quasiatomic Orbital analysis
     {
diff --git a/source/source_io/module_unk/berryphase.cpp b/source/source_io/module_unk/berryphase.cpp
index a41ef3f6bd0..8ca67fb0441 100644
--- a/source/source_io/module_unk/berryphase.cpp
+++ b/source/source_io/module_unk/berryphase.cpp
@@ -113,7 +113,7 @@ void berryphase::set_kpoints(const K_Vectors& kv, const int direction)
 
         nppstr = mp_x + 1;
     }
-    else if (direction == 2) // 计算y方向
+    else if (direction == 2) // y-direction calculation
     {
         const int num_string = mp_x * mp_z;
 
@@ -163,7 +163,7 @@ void berryphase::set_kpoints(const K_Vectors& kv, const int direction)
 
         nppstr = mp_y + 1;
     }
-    else if (direction == 3) // 计算z方向
+    else if (direction == 3) // z-direction calculation
     {
         const int num_string = mp_x * mp_y;
 
diff --git a/source/source_io/test/read_input_ptest.cpp b/source/source_io/test/read_input_ptest.cpp
index 78ce87c91a0..615757b112d 100644
--- a/source/source_io/test/read_input_ptest.cpp
+++ b/source/source_io/test/read_input_ptest.cpp
@@ -431,7 +431,6 @@ TEST_F(InputParaTest, ParaRead)
     EXPECT_DOUBLE_EQ(param.inp.sc_thr, 1e-4);
     EXPECT_EQ(param.inp.nsc, 50);
     EXPECT_EQ(param.inp.nsc_min, 4);
-    EXPECT_EQ(param.inp.sc_scf_nmin, 4);
     EXPECT_DOUBLE_EQ(param.inp.alpha_trial, 0.02);
     EXPECT_DOUBLE_EQ(param.inp.sccut, 4.0);
     EXPECT_EQ(param.inp.sc_scf_thr, 1e-3);
diff --git a/source/source_io/test/support/INPUT b/source/source_io/test/support/INPUT
index b79b3517625..799c2e7a318 100644
--- a/source/source_io/test/support/INPUT
+++ b/source/source_io/test/support/INPUT
@@ -389,6 +389,5 @@ decay_grad_switch              1 #
 sc_thr                         1e-04 #Convergence criterion of spin-constrained iteration (RMS) in uB
 nsc                            50 #Maximal number of spin-constrained iteration
 nsc_min                        4 #Minimum number of spin-constrained iteration
-sc_scf_nmin                    4 #Minimum number of outer scf loop before initializing lambda loop
 alpha_trial                    0.02 #Initial trial step size for lambda in eV/uB^2
 sccut                          4 #Maximal step size for lambda in eV/uB
diff --git a/source/source_io/test_serial/read_input_item_test.cpp b/source/source_io/test_serial/read_input_item_test.cpp
index 41e8ba55c83..d23d152f1fd 100644
--- a/source/source_io/test_serial/read_input_item_test.cpp
+++ b/source/source_io/test_serial/read_input_item_test.cpp
@@ -1659,14 +1659,6 @@ TEST_F(InputTest, Item_test2)
         output = testing::internal::GetCapturedStdout();
         EXPECT_THAT(output, testing::HasSubstr("NOTICE"));
     }
-    { // sc_scf_nmin
-        auto it = find_label("sc_scf_nmin", readinput.input_lists);
-        param.input.sc_scf_nmin = 1;
-        testing::internal::CaptureStdout();
-        EXPECT_EXIT(it->second.check_value(it->second, param), ::testing::ExitedWithCode(1), "");
-        output = testing::internal::GetCapturedStdout();
-        EXPECT_THAT(output, testing::HasSubstr("NOTICE"));
-    }
     { // alpha_trial
         auto it = find_label("alpha_trial", readinput.input_lists);
         param.input.alpha_trial = -1;
diff --git a/source/source_lcao/dftu_lcao.cpp b/source/source_lcao/dftu_lcao.cpp
index 5a4c6c45c88..d8b8421d6e7 100644
--- a/source/source_lcao/dftu_lcao.cpp
+++ b/source/source_lcao/dftu_lcao.cpp
@@ -68,7 +68,7 @@ void finish_dftu_lcao(const int iter,
     /// use the converged occupation matrix for next MD/Relax SCF calculation
     if (conv_esolver)
     {
-        dftu_ptr->initialed_locale = true;
+        dftu_ptr->mark_locale_initialized();
     }
 }
 
diff --git a/source/source_lcao/hamilt_lcao.cpp b/source/source_lcao/hamilt_lcao.cpp
index 276db964696..53f394b9c02 100644
--- a/source/source_lcao/hamilt_lcao.cpp
+++ b/source/source_lcao/hamilt_lcao.cpp
@@ -524,6 +524,7 @@ void HamiltLCAO<TK, TR>::updateHk(const int ik)
             }
         }
         this->current_spin = this->kv->isk[ik];
+        dynamic_cast<hamilt::OperatorLCAO<TK, TR>*>(this->ops)->set_current_spin(this->kv->isk[ik]);
     }
     this->getOperator()->init(ik);
     ModuleBase::timer::end("HamiltLCAO", "updateHk");
diff --git a/source/source_lcao/module_deltaspin/CMakeLists.txt b/source/source_lcao/module_deltaspin/CMakeLists.txt
index 6a0c1fea22f..33e667a7e2a 100644
--- a/source/source_lcao/module_deltaspin/CMakeLists.txt
+++ b/source/source_lcao/module_deltaspin/CMakeLists.txt
@@ -8,6 +8,8 @@ list(APPEND objects
     cal_mw_from_lambda.cpp
     template_helpers.cpp
     deltaspin_lcao.cpp
+    sc_parse_json.cpp
+    cal_mw_helper.cpp
 )
 
 add_library(
diff --git a/source/source_lcao/module_deltaspin/basic_funcs.cpp b/source/source_lcao/module_deltaspin/basic_funcs.cpp
index 343b2b37a73..83b101de641 100644
--- a/source/source_lcao/module_deltaspin/basic_funcs.cpp
+++ b/source/source_lcao/module_deltaspin/basic_funcs.cpp
@@ -57,7 +57,7 @@ void scalar_multiply_2d(const std::vector<ModuleBase::Vector3<double>>& array,
                         std::vector<ModuleBase::Vector3<double>>& result)
 {
     int size = array.size();
-    result.reserve(size);
+    result.resize(size);
     for (int i = 0; i < size; i++)
     {
         result[i] = scalar * array[i];
@@ -70,7 +70,7 @@ void add_scalar_multiply_2d(const std::vector<ModuleBase::Vector3<double>>& arra
                             std::vector<ModuleBase::Vector3<double>>& result)
 {
     int size = array_1.size();
-    result.reserve(size);
+    result.resize(size);
     for (int i = 0; i < size; i++)
     {
         result[i] = array_1[i] + scalar * array_2[i];
@@ -82,7 +82,7 @@ void subtract_2d(const std::vector<ModuleBase::Vector3<double>>& array_1,
                  std::vector<ModuleBase::Vector3<double>>& result)
 {
     int size = array_1.size();
-    result.reserve(size);
+    result.resize(size);
     for (int i = 0; i < size; i++)
     {
             result[i] = array_1[i] - array_2[i];
diff --git a/source/source_lcao/module_deltaspin/basic_funcs.h b/source/source_lcao/module_deltaspin/basic_funcs.h
index b1de060c4bb..6737e0431b4 100644
--- a/source/source_lcao/module_deltaspin/basic_funcs.h
+++ b/source/source_lcao/module_deltaspin/basic_funcs.h
@@ -2,36 +2,56 @@
 #define BASIC_FUNCS_H
 
 #include <cmath>
+#include <complex>
 #include <vector>
 #include <ostream>
 
 #include "source_base/vector3.h"
 
 /**
- * @brief Find the maximum absolute value in a 2D array.
+ * @file basic_funcs.h
+ * @brief Utility vector/array operations for per-atom 3D vector arrays.
+ *
+ * @par Data structure
+ * All functions operate on std::vector<ModuleBase::Vector3<double>>, which
+ * represents a 2D array of shape [nat][3] where:
+ * - First index: atom index (iat = 0 to nat-1)
+ * - Second index: component (x=0, y=1, z=2)
+ *
+ * These are NumPy-style element-wise operations used throughout the lambda
+ * optimization loop for manipulating magnetic moments, lambda values,
+ * search directions, and constraint masks.
+ */
+
+/**
+ * @brief Find the maximum absolute value across all atoms and components.
+ * @return max(|array[iat][ic]|) for all iat, ic
  */
 double maxval_abs_2d(const std::vector<ModuleBase::Vector3<double>>& array);
 
 /**
- * @brief Find the maximum absolute value in a 2D array and its index.
+ * @brief Find the maximum absolute value and its (atom, component) index.
+ * @return pair<iat, ic> of the element with maximum absolute value
  */
 std::pair<int,int> maxloc_abs_2d(const std::vector<ModuleBase::Vector3<double>>& array);
 
 /**
- * @brief sum of all elements in a 2D array.
+ * @brief Sum of all elements across all atoms and components.
+ * @tparam T Numeric type (int or double)
+ * @return sum(array[iat][ic]) for all iat, ic
  */
 template <typename T>
 T sum_2d(const std::vector<ModuleBase::Vector3<T>>& array);
 
 /**
- * @brief scalar multiply a 2D array.
+ * @brief Element-wise scalar multiplication: result = scalar * array.
  */
 void scalar_multiply_2d(const std::vector<ModuleBase::Vector3<double>>& array,
                         double scalar,
                         std::vector<ModuleBase::Vector3<double>>& result);
 
 /**
- * @brief array_1 + scalar * array_2.
+ * @brief Element-wise fused multiply-add: result = array_1 + scalar * array_2.
  */
 void add_scalar_multiply_2d(const std::vector<ModuleBase::Vector3<double>>& array_1,
                             const std::vector<ModuleBase::Vector3<double>>& array_2,
@@ -39,31 +59,50 @@ void add_scalar_multiply_2d(const std::vector<ModuleBase::Vector3<double>>& arra
                             std::vector<ModuleBase::Vector3<double>>& result);
 
 /**
- * @brief array_1 - array_2.
+ * @brief Element-wise subtraction: result = array_1 - array_2.
  */
 void subtract_2d(const std::vector<ModuleBase::Vector3<double>>& array_1,
                  const std::vector<ModuleBase::Vector3<double>>& array_2,
                  std::vector<ModuleBase::Vector3<double>>& result);
 
 /**
- * @brief fill a 2D array with a scalar.
+ * @brief Fill all elements with a scalar value.
  */
 void fill_scalar_2d(double scalar, std::vector<ModuleBase::Vector3<double>>& result);
 
 /**
- * @brief fill a 2D array with a scalar if the corresponding element is equal to mask.
+ * @brief Conditional fill: if mask[iat][ic] == value, set result[iat][ic] = scalar.
+ *
+ * Used to mask unconstrained components to zero:
+ *   where_fill_scalar_2d(constrain_, 0, 0.0, delta_spin)
+ * sets delta_spin[ia][ic] = 0 where constrain[ia][ic] == 0.
  */
 void where_fill_scalar_2d(const std::vector<ModuleBase::Vector3<int>>& array_mask,
                           int mask,
                           double scalar,
                           std::vector<ModuleBase::Vector3<double>>& result);
 
+/**
+ * @brief Conditional fill with else branch: if mask == value, set scalar; otherwise copy from rest.
+ *
+ * Used to create masked copies:
+ *   where_fill_scalar_else_2d(constrain_, 0, 0.0, lambda_, initial_lambda)
+ * sets initial_lambda[ia][ic] = 0 if unconstrained, else lambda_[ia][ic].
+ */
 void where_fill_scalar_else_2d(const std::vector<ModuleBase::Vector3<int>>& array_mask,
                                int mask,
                                double scalar,
                                const std::vector<ModuleBase::Vector3<double>>& rest,
                                std::vector<ModuleBase::Vector3<double>>& result);
 
+/**
+ * @brief Formatted print of a 2D array.
+ * @param info Header string
+ * @param array Data to print
+ * @param nspin Spin type: 2=z-only, 4=xyz
+ * @param unit_convert Multiplicative factor (e.g., Ry_to_eV for unit conversion)
+ * @param ofs Output stream (default: stdout)
+ */
 void print_2d(const std::string info, const std::vector<ModuleBase::Vector3<double>> &array, const int nspin, const double unit_convert = 1.0, std::ostream& ofs = std::cout);
 
-#endif // BASIC_FUNCS_H
\ No newline at end of file
+#endif // BASIC_FUNCS_H
diff --git a/source/source_lcao/module_deltaspin/cal_mw.cpp b/source/source_lcao/module_deltaspin/cal_mw.cpp
index 563f506f3b8..842d32c7187 100644
--- a/source/source_lcao/module_deltaspin/cal_mw.cpp
+++ b/source/source_lcao/module_deltaspin/cal_mw.cpp
@@ -13,23 +13,59 @@
 #include "source_lcao/hamilt_lcao.h"
 #include "source_lcao/module_operator_lcao/dspin_lcao.h"
 
+/**
+ * @file cal_mw.cpp
+ * @brief Magnetic moment calculation for LCAO and PW basis sets.
+ *
+ * @par cal_mi_lcao (LCAO)
+ * Uses the DeltaSpin operator to compute magnetic moments from the density
+ * matrix via real-space projection. For nspin=2, only the z-component is
+ * extracted. For nspin=4, all three components are extracted from the
+ * interleaved 4-component spinor density matrix.
+ *
+ * @par cal_mi_pw (PW)
+ * Uses the OnsiteProjector to compute atomic projections <alpha_{l,m}|psi_{k,i}>
+ * (becp coefficients), then decomposes these into magnetic moments using
+ * Pauli matrix traces (accumulate_Mi_from_becp).
+ *
+ * @par Error conditions
+ * - Dynamic cast failure: p_operator is not the correct DeltaSpin type.
+ *   This happens if set_operator() was not called with the correct type.
+ *   Solution: Ensure set_operator() is called before cal_mi_lcao().
+ */
+
+/**
+ * @brief Calculate atomic magnetic moments using real-space projection (LCAO basis).
+ *
+ * @details The DeltaSpin operator computes magnetic moments by projecting the
+ * density matrix onto atomic orbitals. For each constrained atom:
+ *   M_i = Tr[P_at * (rho_up - rho_dn)]  (nspin=2)
+ *   M_i = Tr[P_at * rho_spinor]          (nspin=4, decomposed via Pauli matrices)
+ *
+ * @param step Current SCF iteration number (for logging)
+ * @param print Whether to print moments (unused in this implementation)
+ */
 template <>
 void spinconstrain::SpinConstrain<std::complex<double>>::cal_mi_lcao(const int& step, bool print)
 {
     ModuleBase::TITLE("module_deltaspin", "cal_mi_lcao");
     ModuleBase::timer::start("spinconstrain::SpinConstrain", "cal_mi_lcao");
-    // calculate MW from lambda in real space projection method
+    // Reset Mi before calculation
     this->zero_Mi();
     const hamilt::HContainer<double>* dmr = this->dm_->get_DMR_pointer(1);
     std::vector<double> moments;
-    if(PARAM.inp.nspin==2)
+    if(this->nspin_==2)
     {
+        // Switch to spin-difference density matrix (rho_up - rho_dn)
         this->dm_->switch_dmr(2);
 
+        // Compute moments via DeltaSpin operator
         moments = static_cast<hamilt::DeltaSpin<hamilt::OperatorLCAO<std::complex<double>, double>>*>(this->p_operator)->cal_moment(dmr, this->get_constrain());
 
+        // Switch back to total density matrix
         this->dm_->switch_dmr(0);
 
+        // For nspin=2, only z-component is meaningful
         for(int iat=0;iat<this->Mi_.size();iat++)
         {
             this->Mi_[iat].x = 0.0;
@@ -37,8 +73,9 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mi_lcao(const int&
             this->Mi_[iat].z = moments[iat];
         }
     }
-    else if(PARAM.inp.nspin==4)
+    else if(this->nspin_==4)
     {
+        // For nspin=4, moments array contains interleaved [Mx, My, Mz] per atom
         moments = static_cast<hamilt::DeltaSpin<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>*>(this->p_operator)->cal_moment(dmr, this->get_constrain());
         for(int iat=0;iat<this->Mi_.size();iat++)
         {
@@ -53,6 +90,21 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mi_lcao(const int&
 
 #endif
 
+/**
+ * @brief Calculate atomic magnetic moments using projector overlap (PW basis).
+ *
+ * @details For each k-point:
+ *   1. Tabulate atomic projectors: set up |alpha_{l,m}> for each atom
+ *   2. Compute becp = <alpha_{l,m}|psi_{k,i}> via overlap_proj_psi
+ *   3. Decompose becp into magnetic moments via accumulate_Mi_from_becp
+ *
+ * The magnetic moment is computed as:
+ *   Mi = sum_{k,i} w_{k,i} * <psi_{k,i}|P_at|sigma|psi_{k,i}>
+ * where P_at is the atomic projector and sigma are the Pauli matrices.
+ *
+ * Finally, Mi is summed across all MPI k-pool ranks since each pool only
+ * has a subset of k-points.
+ */
 template <>
 void spinconstrain::SpinConstrain<std::complex<double>>::cal_mi_pw()
 {
@@ -63,7 +115,7 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mi_pw()
     if(PARAM.inp.device == "cpu")
     {
         auto* onsite_p = projectors::OnsiteProjector<double, base_device::DEVICE_CPU>::get_instance();
-        // loop over k-points to calculate Mi of \sum_{k,i,l,m}<Psi_{k,i}|alpha_{l,m}><alpha_{l,m}|Psi_{k,i}>
+        // Loop over k-points to calculate Mi of sum_{k,i,l,m}<Psi_{k,i}|alpha_{l,m}><alpha_{l,m}|Psi_{k,i}>
         std::complex<double>* psi_pointer = nullptr;
         psi::Psi<std::complex<double>, base_device::DEVICE_CPU>* psi_t = static_cast<psi::Psi<std::complex<double>, base_device::DEVICE_CPU>*>(this->psi);
         const int nbands = psi_t->get_nbands();
@@ -73,43 +125,18 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mi_pw()
         {
             psi_t->fix_k(ik);
             psi_pointer = psi_t->get_pointer();
-            onsite_p->tabulate_atomic(ik); // tabulate for each atom at each k-point
-            // std::cout << __FILE__ << ":" << __LINE__ << " nbands = " << nbands << std::endl;
-            onsite_p->overlap_proj_psi(nbands * npol, psi_pointer);
+            onsite_p->tabulate_atomic(ik); // Set up atomic projectors for this k-point
+            onsite_p->overlap_proj_psi(nbands * npol, psi_pointer); // Compute becp = <alpha|psi>
             const std::complex<double>* becp = onsite_p->get_h_becp();
-            // becp(nbands*npol , nkb)
-            // mag = wg * \sum_{nh}becp * becp
             int nkb = onsite_p->get_tot_nproj();
-            for(int ib = 0;ib<nbands;ib++)
-            {
-                const double weight = this->pelec->wg(ik, ib);
-                int begin_ih = 0;
-                for(int iat = 0; iat < this->Mi_.size(); iat++)
-                {
-                    std::complex<double> occ[4] = {ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO};
-                    const int nh = onsite_p->get_nh(iat);
-                    for(int ih = 0; ih < nh; ih++)
-                    {
-                        const int index = ib*2*nkb + begin_ih + ih;
-                        occ[0] += conj(becp[index]) * becp[index];
-                        occ[1] += conj(becp[index]) * becp[index + nkb];
-                        occ[2] += conj(becp[index + nkb]) * becp[index];
-                        occ[3] += conj(becp[index + nkb]) * becp[index + nkb];
-                    }
-                    // occ has been reduced and calculate mag
-                    this->Mi_[iat].z += weight * (occ[0] - occ[3]).real();
-                    this->Mi_[iat].x += weight * (occ[1] + occ[2]).real();
-                    this->Mi_[iat].y += weight * (occ[1] - occ[2]).imag();
-                    begin_ih += nh;
-                }
-            }
+            this->accumulate_Mi_from_becp(becp, nkb, nbands, npol, ik,
+                &this->pelec->wg(ik, 0), &onsite_p->get_nh(0));
         }
     }
 #if ((defined __CUDA) || (defined __ROCM))
     else
     {
         auto* onsite_p = projectors::OnsiteProjector<double, base_device::DEVICE_GPU>::get_instance();
-        // loop over k-points to calculate Mi of \sum_{k,i,l,m}<Psi_{k,i}|alpha_{l,m}><alpha_{l,m}|Psi_{k,i}>
         std::complex<double>* psi_pointer = nullptr;
         psi::Psi<std::complex<double>, base_device::DEVICE_GPU>* psi_t = static_cast<psi::Psi<std::complex<double>, base_device::DEVICE_GPU>*>(this->psi);
         const int nbands = psi_t->get_nbands();
@@ -119,45 +146,22 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mi_pw()
         {
             psi_t->fix_k(ik);
             psi_pointer = psi_t->get_pointer();
-            onsite_p->tabulate_atomic(ik); // tabulate for each atom at each k-point
-            // std::cout << __FILE__ << ":" << __LINE__ << " nbands = " << nbands << std::endl;
+            onsite_p->tabulate_atomic(ik);
             onsite_p->overlap_proj_psi(nbands * npol, psi_pointer);
             const std::complex<double>* becp = onsite_p->get_h_becp();
-            // becp(nbands*npol , nkb)
-            // mag = wg * \sum_{nh}becp * becp
             int nkb = onsite_p->get_size_becp() / nbands / npol;
-            for(int ib = 0;ib<nbands;ib++)
-            {
-                const double weight = this->pelec->wg(ik, ib);
-                int begin_ih = 0;
-                for(int iat = 0; iat < this->Mi_.size(); iat++)
-                {
-                    std::complex<double> occ[4] = {ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO};
-                    const int nh = onsite_p->get_nh(iat);
-                    for(int ih = 0; ih < nh; ih++)
-                    {
-                        const int index = ib*2*nkb + begin_ih + ih;
-                        occ[0] += conj(becp[index]) * becp[index];
-                        occ[1] += conj(becp[index]) * becp[index + nkb];
-                        occ[2] += conj(becp[index + nkb]) * becp[index];
-                        occ[3] += conj(becp[index + nkb]) * becp[index + nkb];
-                    }
-                    // occ has been reduced and calculate mag
-                    this->Mi_[iat].z += weight * (occ[0] - occ[3]).real();
-                    this->Mi_[iat].x += weight * (occ[1] + occ[2]).real();
-                    this->Mi_[iat].y += weight * (occ[1] - occ[2]).imag();
-                    begin_ih += nh;
-                }
-            }
+            this->accumulate_Mi_from_becp(becp, nkb, nbands, npol, ik,
+                &this->pelec->wg(ik, 0), &onsite_p->get_nh(0));
         }
     }
 #endif
-    // reduce mag from all k-pools
+    // MPI reduction: sum Mi across all k-pool ranks
     Parallel_Reduce::reduce_double_allpool(PARAM.inp.kpar, GlobalV::NPROC_IN_POOL, &(this->Mi_[0][0]), 3 * this->Mi_.size());
-    
+
     ModuleBase::timer::end("spinconstrain::SpinConstrain", "cal_mi_pw");
 }
 
+/// @brief Set the DeltaSpin operator pointer for LCAO magnetic moment calculation
 template <>
 void spinconstrain::SpinConstrain<std::complex<double>>::set_operator(
     hamilt::Operator<std::complex<double>>* op_in)
@@ -165,6 +169,7 @@ void spinconstrain::SpinConstrain<std::complex<double>>::set_operator(
     this->p_operator = op_in;
 }
 
+/// @brief Set the DeltaSpin operator pointer (double specialization)
 template <>
 void spinconstrain::SpinConstrain<double>::set_operator(
     hamilt::Operator<double>* op_in)
diff --git a/source/source_lcao/module_deltaspin/cal_mw_from_lambda.cpp b/source/source_lcao/module_deltaspin/cal_mw_from_lambda.cpp
index 92794fbee27..7b917575f4e 100644
--- a/source/source_lcao/module_deltaspin/cal_mw_from_lambda.cpp
+++ b/source/source_lcao/module_deltaspin/cal_mw_from_lambda.cpp
@@ -1,5 +1,6 @@
 #include "source_base/timer.h"
 #include "source_base/tool_title.h"
+#include "source_base/global_variable.h"
 #include "source_hsolver/diago_iter_assist.h"
 #include "source_io/module_parameter/parameter.h"
 #include "spin_constrain.h"
@@ -18,19 +19,92 @@
 #include "source_lcao/module_operator_lcao/dspin_lcao.h"
 #endif
 
+/**
+ * @file cal_mw_from_lambda.cpp
+ * @brief Core computational functions for DeltaSpin.
+ *
+ * @par calculate_delta_hcc
+ * Computes the DeltaSpin correction to the subspace Hamiltonian:
+ *   H_corrected = H_original + becp^† * delta_lambda * becp
+ *
+ * For npol=2 (non-collinear), the 2x2 Pauli matrix coefficients are:
+ *   coeff0 = (lambda_z, 0)        coeff1 = (lambda_x, lambda_y)
+ *   coeff2 = (lambda_x, -lambda_y) coeff3 = (-lambda_z, 0)
+ * Applied as: ps_up = coeff0 * becp_up + coeff2 * becp_dn
+ *             ps_dn = coeff1 * becp_up + coeff3 * becp_dn
+ *
+ * For npol=1 (collinear), only the z-component:
+ *   ps = lambda_z * spin_sign * becp
+ *
+ * @par update_psi_charge_pw_cpu/gpu
+ * Two-stage process for PW basis:
+ *   1. Subspace diagonalization: apply DeltaSpin correction, rotate psi
+ *   2. Full-space update: either run HSolverPW (pw_solve=true) or update weights (pw_solve=false)
+ *
+ * @par cal_mw_from_lambda
+ * The central workflow function called repeatedly during lambda optimization:
+ *   LCAO: update lambda in operator -> solve HSolverLCAO -> compute Mi
+ *   PW: save subspace data (first call) -> apply H correction -> diagonalize in subspace -> compute Mi from becp
+ *
+ * @par Error conditions
+ * - assert(sub_h_save != nullptr): cal_mw_from_lambda() must be called before
+ *   update_psi_charge_pw(). Failure means the workflow order is wrong.
+ *   Solution: Ensure cal_mw_from_lambda() is called at the start of each SCF step.
+ */
+
+/**
+ * @brief Compute DeltaSpin correction to the subspace Hamiltonian.
+ *
+ * @details Adds the constraint term to H in the projector subspace:
+ *   H += becp^† * ps, where ps = delta_lambda * becp
+ *
+ * For non-collinear (npol=2), this implements the full 2x2 Pauli matrix:
+ *   H_delta = | lambda_z     lambda_x + i*lambda_y |
+ *             | lambda_x - i*lambda_y   -lambda_z  |
+ *
+ * For collinear (npol=1), only the diagonal z-component with spin_sign:
+ *   H_delta = lambda_z * spin_sign
+ *
+ * @param h_tmp Subspace Hamiltonian (nbands x nbands, modified in place)
+ * @param becp_k Projector coefficients for k-point ik
+ * @param delta_lambda Lambda change per atom (or full lambda if full_update)
+ * @param nbands Number of bands
+ * @param nkb Total number of projectors
+ * @param nh_iat Number of projectors per atom
+ * @param ik K-point index (for spin_sign lookup in collinear mode)
+ * @param full_update If true, compute delta = lambda_current - lambda_at_save
+ */
 template <>
-void spinconstrain::SpinConstrain<std::complex<double>>::calculate_delta_hcc(std::complex<double>* h_tmp, const std::complex<double>* becp_k, const ModuleBase::Vector3<double>* delta_lambda, const int nbands, const int nkb, const int* nh_iat)
+void spinconstrain::SpinConstrain<std::complex<double>>::calculate_delta_hcc(std::complex<double>* h_tmp, const std::complex<double>* becp_k, const ModuleBase::Vector3<double>* delta_lambda, const int nbands, const int nkb, const int* nh_iat, const int ik, bool full_update)
 {
-    int sum = 0;
-    int size_ps = nkb * 2 * nbands;
+    ModuleBase::TITLE("spinconstrain::SpinConstrain", "calculate_delta_hcc");
+    ModuleBase::timer::start("spinconstrain::SpinConstrain", "calculate_delta_hcc");
+
+    // If full_update, compute actual delta = lambda_current - lambda_at_save
+    // This applies only the CHANGE in lambda, not the full lambda value
+    std::vector<ModuleBase::Vector3<double>> actual_delta;
+    const ModuleBase::Vector3<double>* effective_lambda = delta_lambda;
+    if (full_update)
+    {
+        int nat = this->get_nat();
+        actual_delta.resize(nat);
+        for (int iat = 0; iat < nat; iat++)
+        {
+            actual_delta[iat] = delta_lambda[iat] - this->lambda_in_sub_[iat];
+        }
+        effective_lambda = actual_delta.data();
+    }
+
+    int sum = 0; // Running sum of projectors across atoms
+    int size_ps = nkb * this->npol_ * nbands; // Total size of ps array
     std::complex<double>* becp_cpu = nullptr;
+
+    // Handle GPU/CPU memory for becp
     if(PARAM.inp.device == "gpu")
     {
 #if ((defined __CUDA) || (defined __ROCM))
-        base_device::DEVICE_GPU* ctx = {};
-        base_device::DEVICE_CPU* cpu_ctx = {};
         base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_CPU>()(becp_cpu, size_ps);
-        base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(becp_cpu, becp_k, size_ps);   
+        base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(becp_cpu, becp_k, size_ps);
 #endif
     }
     else if (PARAM.inp.device == "cpu")
@@ -38,54 +112,89 @@ void spinconstrain::SpinConstrain<std::complex<double>>::calculate_delta_hcc(std
         becp_cpu = const_cast<std::complex<double>*>(becp_k);
     }
 
+    // Compute modified projector coefficients: ps = delta_lambda * becp
     std::vector<std::complex<double>> ps(size_ps, 0.0);
-    for (int iat = 0; iat < this->Mi_.size(); iat++)
+    if(this->npol_ == 2)
+    {
+        // =============================================================
+        // nspin=4 (non-collinear): full Pauli matrix treatment
+        // =============================================================
+        // For each atom, construct 2x2 coefficients:
+        //   | lambda_z      lambda_x + i*lambda_y |
+        //   | lambda_x - i*lambda_y   -lambda_z   |
+        // Then: ps_up = coeff0 * becp_up + coeff2 * becp_dn
+        //        ps_dn = coeff1 * becp_up + coeff3 * becp_dn
+        for (int iat = 0; iat < this->Mi_.size(); iat++)
+        {
+            const int nproj = nh_iat[iat];
+            const std::complex<double> coefficients0(effective_lambda[iat][2], 0.0);
+            const std::complex<double> coefficients1(effective_lambda[iat][0] , effective_lambda[iat][1]);
+            const std::complex<double> coefficients2(effective_lambda[iat][0] , -1 * effective_lambda[iat][1]);
+            const std::complex<double> coefficients3(-1 * effective_lambda[iat][2], 0.0);
+            for (int ib = 0; ib < nbands * this->npol_; ib += this->npol_)
+            {
+                for (int ip = 0; ip < nproj; ip++)
+                {
+                    const int becpind = ib * nkb + sum + ip;
+                    const std::complex<double> becp1 = becp_cpu[becpind];
+                    const std::complex<double> becp2 = becp_cpu[becpind + nkb];
+                    ps[becpind] += coefficients0 * becp1
+                                    + coefficients2 * becp2;
+                    ps[becpind + nkb] += coefficients1 * becp1
+                                        + coefficients3 * becp2;
+                }
+            }
+            sum += nproj;
+        }
+    }
+    else if(this->npol_ == 1)
     {
-        const int nproj = nh_iat[iat];
-        const std::complex<double> coefficients0(delta_lambda[iat][2], 0.0);
-        const std::complex<double> coefficients1(delta_lambda[iat][0] , delta_lambda[iat][1]);
-        const std::complex<double> coefficients2(delta_lambda[iat][0] , -1 * delta_lambda[iat][1]);
-        const std::complex<double> coefficients3(-1 * delta_lambda[iat][2], 0.0);
-        // each atom has nproj, means this is with structure factor;
-        // each projector (each atom) must multiply coefficient
-        // with all the other projectors.
-        for (int ib = 0; ib < nbands * 2; ib+=2)
+        // =============================================================
+        // nspin=2 (collinear): only z-component with spin_sign
+        // =============================================================
+        // ps = lambda_z * spin_sign * becp
+        // spin_sign = +1 for spin-up k-points, -1 for spin-down
+        for (int iat = 0; iat < this->Mi_.size(); iat++)
         {
-            for (int ip = 0; ip < nproj; ip++)
+            const int nproj = nh_iat[iat];
+            double coefficients0 = effective_lambda[iat][2] * this->get_spin_sign(ik);
+            for (int ib = 0; ib < nbands; ib++)
             {
-                const int becpind = ib * nkb + sum + ip;
-                const std::complex<double> becp1 = becp_cpu[becpind];
-                const std::complex<double> becp2 = becp_cpu[becpind + nkb];
-                ps[becpind] += coefficients0 * becp1
-                                + coefficients2 * becp2;
-                ps[becpind + nkb] += coefficients1 * becp1
-                                    + coefficients3 * becp2;
-            } // end ip
-        } // end ib
-        sum += nproj;
-    } // end iat
+                for (int ip = 0; ip < nproj; ip++)
+                {
+                    const int becpind = ib * nkb + sum + ip;
+                    const std::complex<double> becp1 = becp_cpu[becpind];
+                    ps[becpind] += coefficients0 * becp1;
+                }
+            }
+            sum += nproj;
+        }
+    }
+
+    // Copy ps to GPU if needed
     std::complex<double>* ps_pointer = nullptr;
     if(PARAM.inp.device == "gpu")
     {
 #if ((defined __CUDA) || (defined __ROCM))
-        base_device::DEVICE_GPU* ctx = {};
-        base_device::DEVICE_CPU* cpu_ctx = {};
         base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ps_pointer, size_ps);
-        base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(ps_pointer, ps.data(), size_ps);   
+        base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(ps_pointer, ps.data(), size_ps);
 #endif
     }
     else if (PARAM.inp.device == "cpu")
     {
         ps_pointer = ps.data();
     }
-    // update h_tmp by becp_k * ps
-    char transa = 'C';
-    char transb = 'N';
-    const int npm = nkb * 2;
+
+    // =============================================================
+    // H += becp^† * ps (GEMM: C = alpha * A^† * B + beta * C)
+    // A = becp_k (npm x nbands), B = ps (npm x nbands), C = h_tmp (nbands x nbands)
+    // =============================================================
+    char transa = 'C'; // Conjugate transpose of becp
+    char transb = 'N'; // Normal ps
+    const int npm = nkb * this->npol_;
     if (PARAM.inp.device == "gpu")
     {
 #if ((defined __CUDA) || (defined __ROCM))
-        base_device::DEVICE_GPU* ctx = {};
         ModuleBase::gemm_op<std::complex<double>, base_device::DEVICE_GPU>()(
             transa,
             transb,
@@ -102,13 +211,12 @@ void spinconstrain::SpinConstrain<std::complex<double>>::calculate_delta_hcc(std
             nbands
         );
         base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ps_pointer);
-        delete[] becp_cpu;
+        base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_CPU>()(becp_cpu);
 #endif
 
     }
     else if (PARAM.inp.device == "cpu")
     {
-        base_device::DEVICE_CPU* ctx = {};
         ModuleBase::gemm_op<std::complex<double>, base_device::DEVICE_CPU>()(
             transa,
             transb,
@@ -125,36 +233,295 @@ void spinconstrain::SpinConstrain<std::complex<double>>::calculate_delta_hcc(std
             nbands
         );
     }
+    ModuleBase::timer::end("spinconstrain::SpinConstrain", "calculate_delta_hcc");
+}
+
+/**
+ * @brief CPU implementation of PW wavefunction and charge density update.
+ *
+ * @par Two-stage process:
+ * Stage 1 - Subspace diagonalization:
+ *   For each k-point, apply DeltaSpin correction to the saved subspace H,
+ *   then diagonalize to rotate the wavefunctions. This is a cheap operation
+ *   in the reduced subspace (nbands x nbands).
+ *
+ * Stage 2 - Full-space update:
+ *   Option A (pw_solve=true): Run HSolverPW for iterative refinement in the
+ *     full plane-wave space. This is more accurate but expensive.
+ *   Option B (pw_solve=false): Update weights from new eigenvalues and call
+ *     psiToRho() to build the charge density from current psi. Faster but
+ *     may be less accurate if the subspace rotation was not sufficient.
+ *
+ * @par Memory management
+ * Frees sub_h_save, sub_s_save, becp_save after use. These are allocated
+ * on the first cal_mw_from_lambda() call and should only be freed here.
+ *
+ * @param delta_lambda Lambda change for incremental H correction
+ * @param pw_solve If true, run full PW solver; if false, just update weights
+ * @param full_update If true, apply full lambda (not delta) to H correction
+ */
+template <>
+void spinconstrain::SpinConstrain<std::complex<double>>::update_psi_charge_pw_cpu(const ModuleBase::Vector3<double>* delta_lambda, bool pw_solve, bool full_update)
+{
+    ModuleBase::TITLE("spinconstrain::SpinConstrain", "update_psi_charge_pw_cpu");
+    ModuleBase::timer::start("spinconstrain::SpinConstrain", "update_psi_charge_pw_cpu");
+
+    psi::Psi<std::complex<double>>* psi_t = static_cast<psi::Psi<std::complex<double>>*>(this->psi);
+    hamilt::Hamilt<std::complex<double>, base_device::DEVICE_CPU>* hamilt_t = static_cast<hamilt::Hamilt<std::complex<double>, base_device::DEVICE_CPU>*>(this->p_hamilt);
+    auto* onsite_p = projectors::OnsiteProjector<double, base_device::DEVICE_CPU>::get_instance();
+
+    int nbands = psi_t->get_nbands();
+    int npol = psi_t->get_npol();
+    int nkb = onsite_p->get_tot_nproj();
+    int nk = psi_t->get_nk();
+    int size_becp = nbands * nkb * npol;
+    const int* nh_iat = &onsite_p->get_nh(0);
+
+    std::vector<std::complex<double>> h_tmp(nbands * nbands), s_tmp(nbands * nbands);
+
+    // CRITICAL: subspace data must have been saved by cal_mw_from_lambda()
+    assert(this->sub_h_save != nullptr);
+    assert(this->sub_s_save != nullptr);
+    assert(this->becp_save != nullptr);
+
+    // Determine which lambda to use for H correction
+    const ModuleBase::Vector3<double>* lambda_for_hcc = delta_lambda;
+    std::vector<ModuleBase::Vector3<double>> computed_delta;
+    if (full_update)
+    {
+        lambda_for_hcc = this->lambda_.data();
+    }
+
+    // =============================================================
+    // STAGE 1: Subspace diagonalization for each k-point
+    // =============================================================
+    for (int ik = 0; ik < nk; ++ik)
+    {
+        std::complex<double>* h_k = this->sub_h_save + ik * nbands * nbands;
+        std::complex<double>* s_k = this->sub_s_save + ik * nbands * nbands;
+        std::complex<double>* becp_k = this->becp_save + ik * size_becp;
+
+        psi_t->fix_k(ik);
+
+        // Copy saved subspace matrices to temp
+        memcpy(h_tmp.data(), h_k, sizeof(std::complex<double>) * nbands * nbands);
+        memcpy(s_tmp.data(), s_k, sizeof(std::complex<double>) * nbands * nbands);
+
+        // Apply DeltaSpin correction: H += becp^† * lambda * becp
+        this->calculate_delta_hcc(h_tmp.data(), becp_k, lambda_for_hcc, nbands, nkb, nh_iat, ik, full_update);
+
+        // Diagonalize in subspace to update wavefunction coefficients and eigenvalues
+        hsolver::DiagoIterAssist<std::complex<double>>::diag_subspace_psi(h_tmp.data(),
+                                                                        s_tmp.data(),
+                                                                        nbands,
+                                                                        psi_t[0],
+                                                                        &this->pelec->ekb(ik, 0));
+    }
+
+    // Free saved subspace data (allocated in cal_mw_from_lambda)
+    delete[] this->sub_h_save;
+    delete[] this->sub_s_save;
+    delete[] this->becp_save;
+    this->sub_h_save = nullptr;
+    this->sub_s_save = nullptr;
+    this->becp_save = nullptr;
+
+    // =============================================================
+    // STAGE 2: Full-space update
+    // =============================================================
+    if (pw_solve)
+    {
+        // Full PW diagonalization: subspace rotation provides a good initial guess,
+        // then HSolverPW iteratively refines psi in the full plane-wave space and calls psiToRho.
+        hsolver::HSolverPW<std::complex<double>, base_device::DEVICE_CPU> hsolver_pw_obj(
+            this->pw_wfc_,
+            PARAM.inp.calculation,
+            PARAM.inp.basis_type,
+            PARAM.inp.ks_solver,
+            PARAM.globalv.use_uspp,
+            PARAM.inp.nspin,
+            hsolver::DiagoIterAssist<std::complex<double>>::SCF_ITER,
+            hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_NMAX,
+            hsolver::DiagoIterAssist<std::complex<double>>::PW_DIAG_THR,
+            hsolver::DiagoIterAssist<std::complex<double>>::need_subspace,
+            PARAM.inp.use_k_continuity);
+
+        hsolver_pw_obj.solve(hamilt_t, psi_t[0], this->pelec, this->pelec->ekb.c,
+            GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL, false, this->tpiba, this->get_nat());
+    }
+    else
+    {
+        // No full solver: update weights from new eigenvalues, then build rho from current psi
+        elecstate::calculate_weights(this->pelec->ekb,
+                                     this->pelec->wg,
+                                     this->pelec->klist,
+                                     this->pelec->eferm,
+                                     this->pelec->f_en,
+                                     this->pelec->nelec_spin,
+                                     this->pelec->skip_weights);
+        elecstate::calEBand(this->pelec->ekb, this->pelec->wg, this->pelec->f_en);
+        reinterpret_cast<elecstate::ElecStatePW<std::complex<double>, base_device::DEVICE_CPU>*>(this->pelec)->psiToRho(*psi_t);
+    }
+    ModuleBase::timer::end("spinconstrain::SpinConstrain", "update_psi_charge_pw_cpu");
 }
 
+#if ((defined __CUDA) || (defined __ROCM))
+/**
+ * @brief GPU implementation of PW wavefunction and charge density update.
+ *
+ * @details Same algorithm as update_psi_charge_pw_cpu(), but with GPU memory
+ * management (device allocation, host-device synchronization).
+ */
+template <>
+void spinconstrain::SpinConstrain<std::complex<double>>::update_psi_charge_pw_gpu(const ModuleBase::Vector3<double>* delta_lambda, bool pw_solve, bool full_update)
+{
+    ModuleBase::TITLE("spinconstrain::SpinConstrain", "update_psi_charge_pw_gpu");
+    ModuleBase::timer::start("spinconstrain::SpinConstrain", "update_psi_charge_pw_gpu");
+
+    psi::Psi<std::complex<double>, base_device::DEVICE_GPU>* psi_t = static_cast<psi::Psi<std::complex<double>, base_device::DEVICE_GPU>*>(this->psi);
+    hamilt::Hamilt<std::complex<double>, base_device::DEVICE_GPU>* hamilt_t = static_cast<hamilt::Hamilt<std::complex<double>, base_device::DEVICE_GPU>*>(this->p_hamilt);
+    auto* onsite_p = projectors::OnsiteProjector<double, base_device::DEVICE_GPU>::get_instance();
+
+    int nbands = psi_t->get_nbands();
+    int npol = psi_t->get_npol();
+    int nkb = onsite_p->get_tot_nproj();
+    int nk = psi_t->get_nk();
+    int size_becp = nbands * nkb * npol;
+    const int* nh_iat = &onsite_p->get_nh(0);
+
+    std::complex<double>* h_tmp = nullptr;
+    std::complex<double>* s_tmp = nullptr;
+    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(h_tmp, nbands * nbands);
+    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(s_tmp, nbands * nbands);
+
+    assert(this->sub_h_save != nullptr);
+    assert(this->sub_s_save != nullptr);
+    assert(this->becp_save != nullptr);
+
+    const ModuleBase::Vector3<double>* lambda_for_hcc = delta_lambda;
+    std::vector<ModuleBase::Vector3<double>> computed_delta;
+    if (full_update)
+    {
+        lambda_for_hcc = this->lambda_.data();
+    }
+
+    // STAGE 1: Subspace diagonalization for each k-point (GPU)
+    for (int ik = 0; ik < nk; ++ik)
+    {
+        std::complex<double>* h_k = this->sub_h_save + ik * nbands * nbands;
+        std::complex<double>* s_k = this->sub_s_save + ik * nbands * nbands;
+        std::complex<double>* becp_k = this->becp_save + ik * size_becp;
+
+        psi_t->fix_k(ik);
+
+        base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(h_tmp, h_k, nbands * nbands);
+        base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(s_tmp, s_k, nbands * nbands);
+
+        this->calculate_delta_hcc(h_tmp, becp_k, lambda_for_hcc, nbands, nkb, nh_iat, ik, full_update);
+
+        hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::diag_subspace_psi(h_tmp,
+                                                                                s_tmp,
+                                                                                nbands,
+                                                                                psi_t[0],
+                                                                                &this->pelec->ekb(ik, 0));
+    }
+
+    // Free GPU memory for saved subspace data
+    base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(sub_h_save);
+    base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(sub_s_save);
+    base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(becp_save);
+    this->sub_h_save = nullptr;
+    this->sub_s_save = nullptr;
+    this->becp_save = nullptr;
+
+    // STAGE 2: Full-space update (GPU)
+    if (pw_solve)
+    {
+        hsolver::HSolverPW<std::complex<double>, base_device::DEVICE_GPU> hsolver_pw_obj(
+            this->pw_wfc_,
+            PARAM.inp.calculation,
+            PARAM.inp.basis_type,
+            PARAM.inp.ks_solver,
+            PARAM.globalv.use_uspp,
+            PARAM.inp.nspin,
+            hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::SCF_ITER,
+            hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::PW_DIAG_NMAX,
+            hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::PW_DIAG_THR,
+            hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::need_subspace,
+            PARAM.inp.use_k_continuity);
+
+        hsolver_pw_obj.solve(hamilt_t, psi_t[0], this->pelec, this->pelec->ekb.c,
+            GlobalV::RANK_IN_POOL, GlobalV::NPROC_IN_POOL, false, this->tpiba, this->get_nat());
+    }
+    else
+    {
+        elecstate::calculate_weights(this->pelec->ekb,
+                                     this->pelec->wg,
+                                     this->pelec->klist,
+                                     this->pelec->eferm,
+                                     this->pelec->f_en,
+                                     this->pelec->nelec_spin,
+                                     this->pelec->skip_weights);
+        elecstate::calEBand(this->pelec->ekb, this->pelec->wg, this->pelec->f_en);
+        reinterpret_cast<elecstate::ElecStatePW<std::complex<double>, base_device::DEVICE_GPU>*>(this->pelec)->psiToRho(*psi_t);
+    }
+    ModuleBase::timer::end("spinconstrain::SpinConstrain", "update_psi_charge_pw_gpu");
+}
+#endif
+
+/**
+ * @brief Core workflow: apply lambda -> solve Hamiltonian -> compute magnetic moments.
+ *
+ * @par LCAO path:
+ *   1. Update lambda in DeltaSpin operator (dspin->update_lambda())
+ *   2. Solve HSolverLCAO with charge update disabled (last param = true means no charge update)
+ *   3. Calculate weights from new eigenvalues
+ *   4. Call cal_mi_lcao() to compute moments from density matrix
+ *
+ * @par PW path:
+ *   1. [First call only, i_step==-1] Save subspace H, S, becp from Hamiltonian
+ *      This captures the "unperturbed" state before any lambda is applied.
+ *   2. [i_step!=-1] Apply DeltaSpin correction via calculate_delta_hcc()
+ *      For the first call (i_step==-1), no correction is applied (lambda=0).
+ *   3. Diagonalize in subspace via diag_responce(), update becp coefficients
+ *   4. Calculate weights from new eigenvalues
+ *   5. Call accumulate_Mi_from_becp() for each k-point to compute Mi
+ *   6. MPI reduce Mi across k-pools (each pool has a partial sum)
+ *
+ * @param i_step Current inner lambda step (-1 = initialization, 0+ = optimization)
+ * @param delta_lambda Change in lambda from previous step (unused in this function,
+ *                     the full lambda_ is used for H correction)
+ */
 template <>
 void spinconstrain::SpinConstrain<std::complex<double>>::cal_mw_from_lambda(
-		int i_step, 
+		int i_step,
 		const ModuleBase::Vector3<double>* delta_lambda)
 {
     ModuleBase::TITLE("spinconstrain::SpinConstrain", "cal_mw_from_lambda");
     ModuleBase::timer::start("spinconstrain::SpinConstrain", "cal_mw_from_lambda");
-    // lambda has been updated in the lambda loop
+
 #ifdef __LCAO
     if (PARAM.inp.basis_type == "lcao")
     {
+        // =============================================================
+        // LCAO PATH: Update lambda in operator, solve, compute Mi
+        // =============================================================
         psi::Psi<std::complex<double>>* psi_t = static_cast<psi::Psi<std::complex<double>>*>(this->psi);
         hamilt::Hamilt<std::complex<double>>* hamilt_t = static_cast<hamilt::Hamilt<std::complex<double>>*>(this->p_hamilt);
         hsolver::HSolverLCAO<std::complex<double>> hsolver_t(this->ParaV, PARAM.inp.ks_solver);
-        if (PARAM.inp.nspin == 2)
+        if (this->nspin_ == 2)
         {
             dynamic_cast<hamilt::DeltaSpin<hamilt::OperatorLCAO<std::complex<double>, double>>*>(this->p_operator)
                 ->update_lambda();
         }
-        else if (PARAM.inp.nspin == 4)
+        else if (this->nspin_ == 4)
         {
             dynamic_cast<hamilt::DeltaSpin<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>*>(
                 this->p_operator)
                 ->update_lambda();
         }
-        // diagonalization without update charge
-        // mohan add two parameters charge and nspin, 2025-10-24
-        hsolver_t.solve(hamilt_t, psi_t[0], this->pelec, *this->dm_, *this->pelec->charge, PARAM.inp.nspin, true);
+        // Diagonalization without updating charge density (last param = true means skip charge update)
+        hsolver_t.solve(hamilt_t, psi_t[0], this->pelec, *this->dm_, *this->pelec->charge, this->nspin_, true);
         elecstate::calculate_weights(this->pelec->ekb,
                                      this->pelec->wg,
                                      this->pelec->klist,
@@ -164,36 +531,11 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mw_from_lambda(
                                      this->pelec->skip_weights);
         elecstate::calEBand(this->pelec->ekb,this->pelec->wg,this->pelec->f_en);
 
-        elecstate::cal_dm_psi(this->ParaV, this->pelec->wg, *psi_t, *this->dm_);
-
-        this->dm_->cal_DMR();
-
         this->cal_mi_lcao(i_step);
     }
     else
 #endif
     {
-        /*if (i_step == -1 && this->higher_mag_prec)
-        {
-            // std::cout<<__FILE__<<__LINE__<<"istep == 0"<<std::endl;
-            if (PARAM.inp.device == "cpu")
-            {
-                psi::Psi<std::complex<double>>* psi_t = static_cast<psi::Psi<std::complex<double>>*>(this->psi);
-                hamilt::Hamilt<std::complex<double>>* hamilt_t = static_cast<hamilt::Hamilt<std::complex<double>>*>(this->p_hamilt);
-                hsolver::HSolver<std::complex<double>, base_device::DEVICE_CPU>* hsolver_t = static_cast<hsolver::HSolver<std::complex<double>, base_device::DEVICE_CPU>*>(this->phsol);
-                hsolver_t->solve(hamilt_t, psi_t[0], this->pelec, this->KS_SOLVER, true);
-            }
-            else
-            {
-                psi::Psi<std::complex<double>, base_device::DEVICE_GPU>* psi_t = static_cast<psi::Psi<std::complex<double>, base_device::DEVICE_GPU>*>(this->psi);
-                hamilt::Hamilt<std::complex<double>, base_device::DEVICE_GPU>* hamilt_t = static_cast<hamilt::Hamilt<std::complex<double>, base_device::DEVICE_GPU>*>(this->p_hamilt);
-                hsolver::HSolver<std::complex<double>, base_device::DEVICE_GPU>* hsolver_t = static_cast<hsolver::HSolver<std::complex<double>, base_device::DEVICE_GPU>*>(this->phsol);
-                hsolver_t->solve(hamilt_t, psi_t[0], this->pelec, this->KS_SOLVER, true);
-            }
-            this->pelec->calculate_weights();
-            this->cal_Mi_pw();
-        }
-        else*/
         {
             this->zero_Mi();
             int size_becp = 0;
@@ -205,6 +547,9 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mw_from_lambda(
             const int* nh_iat = nullptr;
             if (PARAM.inp.device == "cpu")
             {
+                // =============================================================
+                // PW PATH (CPU): Subspace diagonalization + Mi from becp
+                // =============================================================
                 psi::Psi<std::complex<double>>* psi_t = static_cast<psi::Psi<std::complex<double>>*>(this->psi);
                 hamilt::Hamilt<std::complex<double>, base_device::DEVICE_CPU>* hamilt_t = static_cast<hamilt::Hamilt<std::complex<double>, base_device::DEVICE_CPU>*>(this->p_hamilt);
                 auto* onsite_p = projectors::OnsiteProjector<double, base_device::DEVICE_CPU>::get_instance();
@@ -219,10 +564,12 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mw_from_lambda(
                 int initial_hs = 0;
                 if(this->sub_h_save == nullptr)
                 {
+                    // FIRST CALL: save subspace data for reuse across lambda steps
                     initial_hs = 1;
                     this->sub_h_save = new std::complex<double>[nbands * nbands * nk];
                     this->sub_s_save = new std::complex<double>[nbands * nbands * nk];
                     this->becp_save = new std::complex<double>[size_becp * nk];
+                    this->lambda_in_sub_ = this->lambda_;
                 }
                 for (int ik = 0; ik < nk; ++ik)
                 {
@@ -234,30 +581,32 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mw_from_lambda(
                     std::complex<double>* becp_k = this->becp_save + ik * size_becp;
                     if(initial_hs)
                     {
-                        /// update H(k) for each k point
+                        /// Compute H(k) and extract subspace matrices for this k-point
                         hamilt_t->updateHk(ik);
                         hsolver::DiagoIterAssist<std::complex<double>>::cal_hs_subspace(hamilt_t, psi_t[0], h_k, s_k);
                         memcpy(becp_k, onsite_p->get_becp(), sizeof(std::complex<double>) * size_becp);
                     }
                     memcpy(h_tmp.data(), h_k, sizeof(std::complex<double>) * nbands * nbands);
                     memcpy(s_tmp.data(), s_k, sizeof(std::complex<double>) * nbands * nbands);
-                    // update h_tmp by delta_lambda
-                    if (i_step != -1) this->calculate_delta_hcc(h_tmp.data(), becp_k, delta_lambda, nbands, nkb, nh_iat);
+                    // Apply DeltaSpin correction (skip for initialization step i_step=-1)
+                    if (i_step != -1) this->calculate_delta_hcc(h_tmp.data(), becp_k, this->lambda_.data(), nbands, nkb, nh_iat, ik, true);
 
+                    // Diagonalize in subspace, update becp (response wavefunctions)
                     hsolver::DiagoIterAssist<std::complex<double>>::diag_responce(h_tmp.data(),
                                                                                   s_tmp.data(),
                                                                                   nbands,
                                                                                   becp_k,
                                                                                   &becp_tmp[ik * size_becp],
-                                                                                  nkb * 2,
+                                                                                  nkb * npol,
                                                                                   &this->pelec->ekb(ik, 0));
                 }
             }
 #if ((defined __CUDA) || (defined __ROCM))
             else
             {
-                base_device::DEVICE_GPU* ctx = {};
-                base_device::DEVICE_CPU* cpu_ctx = {};
+                // =============================================================
+                // PW PATH (GPU): Same as CPU but with GPU memory management
+                // =============================================================
                 psi::Psi<std::complex<double>, base_device::DEVICE_GPU>* psi_t = static_cast<psi::Psi<std::complex<double>, base_device::DEVICE_GPU>*>(this->psi);
                 hamilt::Hamilt<std::complex<double>, base_device::DEVICE_GPU>* hamilt_t = static_cast<hamilt::Hamilt<std::complex<double>, base_device::DEVICE_GPU>*>(this->p_hamilt);
                 auto* onsite_p = projectors::OnsiteProjector<double, base_device::DEVICE_GPU>::get_instance();
@@ -276,13 +625,12 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mw_from_lambda(
                 if(this->sub_h_save == nullptr)
                 {
                     initial_hs = 1;
-                    
                     base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(this->sub_h_save, nbands * nbands * nk);
                     base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(this->sub_s_save, nbands * nbands * nk);
                     base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(this->becp_save, size_becp * nk);
+                    this->lambda_in_sub_ = this->lambda_;
                 }
                 std::complex<double>* becp_pointer = nullptr;
-                // allocate memory for becp_pointer in GPU device
                 base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(becp_pointer, size_becp);
                 for (int ik = 0; ik < nk; ++ik)
                 {
@@ -293,15 +641,13 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mw_from_lambda(
                     std::complex<double>* becp_k = this->becp_save + ik * size_becp;
                     if(initial_hs)
                     {
-                        /// update H(k) for each k point
                         hamilt_t->updateHk(ik);
                         hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::cal_hs_subspace(hamilt_t, psi_t[0], h_k, s_k);
                         base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(becp_k, onsite_p->get_becp(), size_becp);
                     }
                     base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(h_tmp, h_k, nbands * nbands);
                     base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(s_tmp, s_k, nbands * nbands);
-                    // update h_tmp by delta_lambda
-                    if (i_step != -1) this->calculate_delta_hcc(h_tmp, becp_k, delta_lambda, nbands, nkb, nh_iat);
+                    if (i_step != -1) this->calculate_delta_hcc(h_tmp, becp_k, this->lambda_.data(), nbands, nkb, nh_iat, ik, true);
 
                     hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::diag_responce(h_tmp,
                                                                                   s_tmp,
@@ -310,15 +656,14 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mw_from_lambda(
                                                                                   becp_pointer,
                                                                                   nkb * npol,
                                                                                   &this->pelec->ekb(ik, 0));
-                    // copy becp_pointer from GPU to CPU
-                    base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(&becp_tmp[ik * size_becp], becp_pointer, size_becp);   
+                    base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(&becp_tmp[ik * size_becp], becp_pointer, size_becp);
                 }
 
-                // free memory for becp_pointer in GPU device
                 base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(becp_pointer);
             }
 #endif
-            // calculate weights from ekb to update wg
+
+            // Calculate weights from eigenvalues to update occupation
             elecstate::calculate_weights(this->pelec->ekb,
                                          this->pelec->wg,
                                          this->pelec->klist,
@@ -326,53 +671,33 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mw_from_lambda(
                                          this->pelec->f_en,
                                          this->pelec->nelec_spin,
                                          this->pelec->skip_weights);
-            // calculate Mi from existed becp
+            // Calculate Mi from becp coefficients for each k-point
             for (int ik = 0; ik < nk; ik++)
             {
                 const std::complex<double>* becp = &becp_tmp[ik * size_becp];
-                // becp(nbands*npol , nkb)
-                // mag = wg * \sum_{nh}becp * becp
-                for (int ib = 0; ib < nbands; ib++)
-                {
-                    const double weight = this->pelec->wg(ik, ib);
-                    int begin_ih = 0;
-                    for (int iat = 0; iat < this->Mi_.size(); iat++)
-                    {
-                        const int nh = nh_iat[iat];
-                        std::complex<double> occ[4]
-                            = {ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO};
-                        for (int ih = 0; ih < nh; ih++)
-                        {
-                            const int index = ib * npol * nkb + begin_ih + ih;
-                            occ[0] += conj(becp[index]) * becp[index];
-                            occ[1] += conj(becp[index]) * becp[index + nkb];
-                            occ[2] += conj(becp[index + nkb]) * becp[index];
-                            occ[3] += conj(becp[index + nkb]) * becp[index + nkb];
-                        }
-                        // occ has been reduced and calculate mag
-                        this->Mi_[iat].x += weight * (occ[1] + occ[2]).real();
-                        this->Mi_[iat].y += weight * (occ[1] - occ[2]).imag();
-                        this->Mi_[iat].z += weight * (occ[0] - occ[3]).real();
-                        begin_ih += nh;
-                    }
-                }
+                this->accumulate_Mi_from_becp(becp, nkb, nbands, this->npol_, ik,
+                    &this->pelec->wg(ik, 0), nh_iat);
             }
-            Parallel_Reduce::reduce_double_allpool(GlobalV::KPAR,
-                                                   GlobalV::NPROC_IN_POOL,
-                                                   &(this->Mi_[0][0]),
-                                                   3 * this->Mi_.size());
-            // for(int i = 0; i < this->Mi_.size(); i++)
-            //{
-            //     std::cout<<"atom"<<i<<": "<<" mag: "<<this->Mi_[i].x<<" "<<this->Mi_[i].y<<" "<<this->Mi_[i].z<<"
-            //     "<<this->lambda_[i].x<<" "<<this->lambda_[i].y<<" "<<this->lambda_[i].z<<std::endl;
-            // }
+            // MPI reduction: sum Mi across all k-pool ranks
+            Parallel_Reduce::reduce_double_allpool(PARAM.inp.kpar,
+                                                    GlobalV::NPROC_IN_POOL,
+                                                    &(this->Mi_[0][0]),
+                                                    3 * this->Mi_.size());
         }
     }
     ModuleBase::timer::end("spinconstrain::SpinConstrain", "cal_mw_from_lambda");
 }
 
+/**
+ * @brief Dispatcher: route to LCAO or PW (CPU/GPU) wavefunction/charge update.
+ *
+ * @details For LCAO: simply calls psiToRho() since the Hamiltonian already
+ * includes the DeltaSpin correction.
+ * For PW: calls update_psi_charge_pw_cpu or update_psi_charge_pw_gpu
+ * which perform subspace diagonalization and optional full-space refinement.
+ */
 template <>
-void spinconstrain::SpinConstrain<std::complex<double>>::update_psi_charge(const ModuleBase::Vector3<double>* delta_lambda, bool pw_solve)
+void spinconstrain::SpinConstrain<std::complex<double>>::update_psi_charge(const ModuleBase::Vector3<double>* delta_lambda, bool pw_solve, bool full_update)
 {
     ModuleBase::TITLE("spinconstrain::SpinConstrain", "update_psi_charge");
     ModuleBase::timer::start("spinconstrain::SpinConstrain", "update_psi_charge");
@@ -385,159 +710,16 @@ void spinconstrain::SpinConstrain<std::complex<double>>::update_psi_charge(const
     else
 #endif
     {
-        int size_becp = 0;
-        std::vector<std::complex<double>> becp_tmp;
-        int nk = 0;
-        int nkb = 0;
-        int nbands = 0;
-        int npol = 0;
-        const int* nh_iat = nullptr;
         if (PARAM.inp.device == "cpu")
         {
-            psi::Psi<std::complex<double>>* psi_t = static_cast<psi::Psi<std::complex<double>>*>(this->psi);
-            hamilt::Hamilt<std::complex<double>, base_device::DEVICE_CPU>* hamilt_t = static_cast<hamilt::Hamilt<std::complex<double>, base_device::DEVICE_CPU>*>(this->p_hamilt);
-            auto* onsite_p = projectors::OnsiteProjector<double, base_device::DEVICE_CPU>::get_instance();
-            nbands = psi_t->get_nbands();
-            npol = psi_t->get_npol();
-            nkb = onsite_p->get_tot_nproj();
-            nk = psi_t->get_nk();
-            nh_iat = &onsite_p->get_nh(0);
-            size_becp = nbands * nkb * npol;
-            becp_tmp.resize(size_becp * nk);
-            std::vector<std::complex<double>> h_tmp(nbands * nbands), s_tmp(nbands * nbands);
-            assert(this->sub_h_save != nullptr);
-            assert(this->sub_s_save != nullptr);
-            assert(this->becp_save != nullptr);
-            for (int ik = 0; ik < nk; ++ik)
-            {
-                std::complex<double>* h_k = this->sub_h_save + ik * nbands * nbands;
-                std::complex<double>* s_k = this->sub_s_save + ik * nbands * nbands;
-                std::complex<double>* becp_k = this->becp_save + ik * size_becp;
-
-                psi_t->fix_k(ik);
-                memcpy(h_tmp.data(), h_k, sizeof(std::complex<double>) * nbands * nbands);
-                memcpy(s_tmp.data(), s_k, sizeof(std::complex<double>) * nbands * nbands);
-                this->calculate_delta_hcc(h_tmp.data(), becp_k, delta_lambda, nbands, nkb, nh_iat);
-                hsolver::DiagoIterAssist<std::complex<double>>::diag_subspace_psi(h_tmp.data(),
-                                                                                s_tmp.data(),
-                                                                                nbands,
-                                                                                psi_t[0],
-                                                                                &this->pelec->ekb(ik, 0));
-            }
-
-            delete[] this->sub_h_save;
-            delete[] this->sub_s_save;
-            delete[] this->becp_save;
-            this->sub_h_save = nullptr;
-            this->sub_s_save = nullptr;
-            this->becp_save = nullptr;
-
-            if(pw_solve)
-            {
-				hsolver::HSolverPW<std::complex<double>, base_device::DEVICE_CPU> hsolver_pw_obj(this->pw_wfc_,
-						PARAM.inp.calculation,
-						PARAM.inp.basis_type,
-						PARAM.inp.ks_solver,
-						false,
-						PARAM.globalv.use_uspp,
-						PARAM.inp.nspin,
-						hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_CPU>::SCF_ITER,
-						hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_CPU>::PW_DIAG_NMAX,
-						hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_CPU>::PW_DIAG_THR,
-						hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_CPU>::need_subspace);
-
-				hsolver_pw_obj.solve(hamilt_t,
-						psi_t[0],
-						this->pelec,
-						this->pelec->ekb.c,
-						GlobalV::RANK_IN_POOL,
-						GlobalV::NPROC_IN_POOL,
-						false,
-						this->tpiba,
-						this->get_nat());
-            }
-            else
-            {// update charge density only
-                this->pelec->psiToRho(*psi_t);
-            }
+            this->update_psi_charge_pw_cpu(delta_lambda, pw_solve, full_update);
         }
 #if ((defined __CUDA) || (defined __ROCM))
         else
         {
-			base_device::DEVICE_GPU* ctx = {};
-			base_device::DEVICE_CPU* cpu_ctx = {};
-			psi::Psi<std::complex<double>, base_device::DEVICE_GPU>* psi_t = static_cast<psi::Psi<std::complex<double>, base_device::DEVICE_GPU>*>(this->psi);
-			hamilt::Hamilt<std::complex<double>, base_device::DEVICE_GPU>* hamilt_t = static_cast<hamilt::Hamilt<std::complex<double>, base_device::DEVICE_GPU>*>(this->p_hamilt);
-			auto* onsite_p = projectors::OnsiteProjector<double, base_device::DEVICE_GPU>::get_instance();
-			nbands = psi_t->get_nbands();
-			npol = psi_t->get_npol();
-			nkb = onsite_p->get_tot_nproj();
-			nk = psi_t->get_nk();
-			nh_iat = &onsite_p->get_nh(0);
-			size_becp = nbands * nkb * npol;
-
-            std::complex<double>* h_tmp = nullptr;
-            std::complex<double>* s_tmp = nullptr;
-            base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(h_tmp, nbands * nbands);
-            base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(s_tmp, nbands * nbands);
-            assert(this->sub_h_save != nullptr);
-            assert(this->sub_s_save != nullptr);
-            assert(this->becp_save != nullptr);
-            for (int ik = 0; ik < nk; ++ik)
-            {
-                std::complex<double>* h_k = this->sub_h_save + ik * nbands * nbands;
-                std::complex<double>* s_k = this->sub_s_save + ik * nbands * nbands;
-                std::complex<double>* becp_k = this->becp_save + ik * size_becp;
-
-                psi_t->fix_k(ik);
-                base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(h_tmp, h_k, nbands * nbands);
-                base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(s_tmp, s_k, nbands * nbands);
-                this->calculate_delta_hcc(h_tmp, becp_k, delta_lambda, nbands, nkb, nh_iat);
-                hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::diag_subspace_psi(h_tmp,
-                                                                                s_tmp,
-                                                                                nbands,
-                                                                                psi_t[0],
-                                                                                &this->pelec->ekb(ik, 0));
-            }
-
-            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(sub_h_save);
-            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(sub_s_save);
-            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(becp_save);
-            this->sub_h_save = nullptr;
-            this->sub_s_save = nullptr;
-            this->becp_save = nullptr;
-
-            if(pw_solve)
-            {
-                hsolver::HSolverPW<std::complex<double>, base_device::DEVICE_GPU> hsolver_pw_obj(this->pw_wfc_,
-                                                 PARAM.inp.calculation,
-                                                 PARAM.inp.basis_type,
-                                                 PARAM.inp.ks_solver,
-                                                 false,
-                                                 PARAM.globalv.use_uspp,
-                                                 PARAM.inp.nspin,
-                                                 hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::SCF_ITER,
-                                                 hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::PW_DIAG_NMAX,
-                                                 hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::PW_DIAG_THR,
-                                                 hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::need_subspace);
-
-                hsolver_pw_obj.solve(hamilt_t,
-                         psi_t[0],
-                         this->pelec,
-                         this->pelec->ekb.c,
-                         GlobalV::RANK_IN_POOL,
-                         GlobalV::NPROC_IN_POOL,
-                         false,
-                         this->tpiba,
-                         this->get_nat());
-            }
-            else
-            {// update charge density only
-                reinterpret_cast<elecstate::ElecStatePW<std::complex<double>, base_device::DEVICE_GPU>*>(this->pelec)->psiToRho(*psi_t);
-            }
-            
+            this->update_psi_charge_pw_gpu(delta_lambda, pw_solve, full_update);
         }
-#endif       
+#endif
     }
     ModuleBase::timer::end("spinconstrain::SpinConstrain", "update_psi_charge");
 }
diff --git a/source/source_lcao/module_deltaspin/cal_mw_helper.cpp b/source/source_lcao/module_deltaspin/cal_mw_helper.cpp
new file mode 100644
index 00000000000..8e7d6e0e89b
--- /dev/null
+++ b/source/source_lcao/module_deltaspin/cal_mw_helper.cpp
@@ -0,0 +1,235 @@
+#ifdef __LCAO
+#include "spin_constrain.h"
+
+/**
+ * @file cal_mw_helper.cpp
+ * @brief LCAO-specific helper functions for magnetic moment calculation from orbital matrices.
+ *
+ * @par Purpose
+ * Provides alternative paths for computing magnetic moments from the orbital
+ * multiplication matrix (orbMulP) and the mu*density matrix (mud). These are
+ * used when the DeltaSpin operator path is not available or for debugging.
+ *
+ * @par Data flow
+ * 1. convert(): Flatten orbMulP into nested vector [nspin][iat][iw]
+ * 2. calculate_MW(): Sum orbital contributions per atom, compute Mi
+ * 3. collect_MW(): Accumulate mu*dm contributions into MecMulP matrix
+ */
+
+/**
+ * @brief Convert flat orbital matrix to nested vector format.
+ *
+ * @details The orbMulP matrix stores orbital contributions in a flat layout:
+ *   orbMulP(is, num) where num runs through all orbitals of all atoms.
+ * This function reorganizes it into a nested structure:
+ *   AorbMulP[is][iat][iw] = orbMulP(is, num)
+ *
+ * Values below 1e-10 are set to 0.0 to avoid floating-point noise.
+ *
+ * @param orbMulP Flat matrix of orbital contributions [nspin x ntotal_orbitals]
+ * @return Nested vector [nspin][iat][iw]
+ */
+template <>
+std::vector<std::vector<std::vector<double>>> spinconstrain::SpinConstrain<std::complex<double>>::convert(
+    const ModuleBase::matrix& orbMulP)
+{
+    std::vector<std::vector<std::vector<double>>> AorbMulP;
+    AorbMulP.resize(this->nspin_);
+    int nat = this->get_nat();
+    for (int is = 0; is < this->nspin_; ++is)
+    {
+        int num = 0;
+        AorbMulP[is].resize(nat);
+        for (const auto& sc_elem: this->get_atomCounts())
+        {
+            int it = sc_elem.first;
+            int nat_it = sc_elem.second;
+            int nw_it = this->get_orbitalCounts().at(it);
+            for (int ia = 0; ia < nat_it; ia++)
+            {
+                int iat = this->get_iat(it, ia);
+                AorbMulP[is][iat].resize(nw_it, 0.0);
+                for (int iw = 0; iw < nw_it; iw++)
+                {
+                    AorbMulP[is][iat][iw] = std::abs(orbMulP(is, num))< 1e-10 ? 0.0 : orbMulP(is, num);
+                    num++;
+                }
+            }
+        }
+    }
+    return AorbMulP;
+}
+
+/**
+ * @brief Calculate magnetic moments from converted orbital matrix.
+ *
+ * @par Algorithm (nspin=2):
+ *   atom_mag = sum(orbMulP[0][iat]) - sum(orbMulP[1][iat])
+ *   Mi[iat].z = atom_mag (z-component only)
+ *
+ * @par Algorithm (nspin=4):
+ * The 4 spinor components are mapped to magnetic moments:
+ *   total_charge_soc[0] = Tr(rho * I) / 2      (charge)
+ *   total_charge_soc[1] = Tr(rho * sigma_x)    (Mx)
+ *   total_charge_soc[2] = Tr(rho * sigma_y)    (My)
+ *   total_charge_soc[3] = Tr(rho * sigma_z)    (Mz)
+ * Components below sc_thr_ are set to 0.0 to avoid noise.
+ *
+ * @param AorbMulP Nested vector [nspin][iat][iw] from convert()
+ */
+template <>
+void spinconstrain::SpinConstrain<std::complex<double>>::calculate_MW(
+    const std::vector<std::vector<std::vector<double>>>& AorbMulP)
+{
+    size_t nw = this->get_nw();
+    int nat = this->get_nat();
+
+    this->zero_Mi();
+
+    const int nlocal = (this->nspin_ == 4) ? nw / 2 : nw;
+    for (const auto& sc_elem: this->get_atomCounts())
+    {
+        int it = sc_elem.first;
+        int nat_it = sc_elem.second;
+        for (int ia = 0; ia < nat_it; ia++)
+        {
+            int num = 0;
+            int iat = this->get_iat(it, ia);
+            double atom_mag = 0.0;
+            std::vector<double> total_charge_soc(this->nspin_, 0.0);
+            for (const auto& lnchi: this->get_lnchiCounts().at(it))
+            {
+                std::vector<double> sum_l(this->nspin_, 0.0);
+                int L = lnchi.first;
+                int nchi = lnchi.second;
+                for (int Z = 0; Z < nchi; ++Z)
+                {
+                    std::vector<double> sum_m(this->nspin_, 0.0);
+                    for (int M = 0; M < (2 * L + 1); ++M)
+                    {
+                        for (int j = 0; j < this->nspin_; j++)
+                        {
+                            sum_m[j] += AorbMulP[j][iat][num];
+                        }
+                        num++;
+                    }
+                    for (int j = 0; j < this->nspin_; j++)
+                    {
+                        sum_l[j] += sum_m[j];
+                    }
+                }
+                if (this->nspin_ == 2)
+                {
+                    atom_mag += sum_l[0] - sum_l[1];
+                }
+                else if (this->nspin_ == 4)
+                {
+                    for (int j = 0; j < this->nspin_; j++)
+                    {
+                        total_charge_soc[j] += sum_l[j];
+                    }
+                }
+            }
+            if (this->nspin_ == 2)
+            {
+                this->Mi_[iat].x = 0.0;
+                this->Mi_[iat].y = 0.0;
+                this->Mi_[iat].z = atom_mag;
+            }
+            else if (this->nspin_ == 4)
+            {
+                this->Mi_[iat].x = (std::abs(total_charge_soc[1]) < this->sc_thr_)? 0.0 : total_charge_soc[1];
+                this->Mi_[iat].y = (std::abs(total_charge_soc[2]) < this->sc_thr_)? 0.0 : total_charge_soc[2];
+                this->Mi_[iat].z = (std::abs(total_charge_soc[3]) < this->sc_thr_)? 0.0 : total_charge_soc[3];
+            }
+        }
+    }
+}
+
+/**
+ * @brief Accumulate magnetic moment contributions from mu*density matrix.
+ *
+ * @details For distributed matrices (ScaLAPACK), only the local processor's
+ * elements are accumulated. The ParaV mapping converts global indices to
+ * local row/column indices.
+ *
+ * @par nspin=4 spinor decomposition
+ * The mud matrix stores the 2x2 spinor blocks interleaved:
+ *   Global index 2j -> spin-up component
+ *   Global index 2j+1 -> spin-down component
+ * The Pauli matrix traces are:
+ *   M0 (charge): mud(k1,k1).real + mud(k2,k2).real
+ *   M3 (Mz):     mud(k1,k1).real - mud(k2,k2).real
+ *   M1 (Mx):     mud(k1,k2).real + mud(k2,k1).real
+ *   M2 (My):    -mud(k1,k2).imag + mud(k2,k1).imag
+ *
+ * @param MecMulP Output matrix [4 x nw/2]: MecMulP[0]=charge, [1]=Mx, [2]=My, [3]=Mz
+ * @param mud Input mu*density matrix (column-major)
+ * @param nw Total number of orbitals
+ * @param isk Spin index (0 or 1 for nspin=2)
+ */
+template <>
+void spinconstrain::SpinConstrain<std::complex<double>>::collect_MW(ModuleBase::matrix& MecMulP,
+                                                      const ModuleBase::ComplexMatrix& mud,
+                                                      int nw,
+                                                      int isk)
+{
+    if (this->nspin_ == 2)
+    {
+        for (size_t i=0; i < nw; ++i)
+        {
+            if (this->ParaV->in_this_processor(i, i))
+            {
+                const int ir = this->ParaV->global2local_row(i);
+                const int ic = this->ParaV->global2local_col(i);
+                MecMulP(isk, i) += mud(ic, ir).real();
+            }
+        }
+    }
+    else if (this->nspin_ == 4)
+    {
+        for (size_t i = 0; i < nw; ++i)
+        {
+            const int index = i % 2;
+            if (!index)
+            {
+                const int j = i / 2;
+                const int k1 = 2 * j;
+                const int k2 = 2 * j + 1;
+                if (this->ParaV->in_this_processor(k1, k1))
+                {
+                    const int ir = this->ParaV->global2local_row(k1);
+                    const int ic = this->ParaV->global2local_col(k1);
+                    MecMulP(0, j) += mud(ic, ir).real();
+                    MecMulP(3, j) += mud(ic, ir).real();
+                }
+                if (this->ParaV->in_this_processor(k1, k2))
+                {
+                    const int ir = this->ParaV->global2local_row(k1);
+                    const int ic = this->ParaV->global2local_col(k2);
+                    // note that mud is column major
+                    MecMulP(1, j) += mud(ic, ir).real();
+                    // M_y = i(M_{up,down} - M_{down,up}) = -(M_{up,down} - M_{down,up}).imag()
+                    MecMulP(2, j) -= mud(ic, ir).imag();
+                }
+                if (this->ParaV->in_this_processor(k2, k1))
+                {
+                    const int ir = this->ParaV->global2local_row(k2);
+                    const int ic = this->ParaV->global2local_col(k1);
+                    MecMulP(1, j) += mud(ic, ir).real();
+                    // M_y = i(M_{up,down} - M_{down,up}) = -(M_{up,down} - M_{down,up}).imag()
+                    MecMulP(2, j) += mud(ic, ir).imag();
+                }
+                if (this->ParaV->in_this_processor(k2, k2))
+                {
+                    const int ir = this->ParaV->global2local_row(k2);
+                    const int ic = this->ParaV->global2local_col(k2);
+                    MecMulP(0, j) += mud(ic, ir).real();
+                    MecMulP(3, j) -= mud(ic, ir).real();
+                }
+            }
+        }
+    }
+}
+
+#endif
diff --git a/source/source_lcao/module_deltaspin/deltaspin_lcao.cpp b/source/source_lcao/module_deltaspin/deltaspin_lcao.cpp
index 6a7effb6d02..811d6fef193 100644
--- a/source/source_lcao/module_deltaspin/deltaspin_lcao.cpp
+++ b/source/source_lcao/module_deltaspin/deltaspin_lcao.cpp
@@ -5,9 +5,42 @@
 #include "source_estate/module_dm/density_matrix.h"
 #include "source_estate/elecstate.h"
 
+/**
+ * @file deltaspin_lcao.cpp
+ * @brief Wrapper/facade layer between ESolver and DeltaSpin module.
+ *
+ * @par Purpose
+ * Provides a simplified interface to the ESolver layer, hiding the
+ * SpinConstrain Singleton details. The ESolver calls these functions
+ * rather than accessing SpinConstrain directly.
+ *
+ * @par Design rationale
+ * - Template functions: Support both TK=double (nspin=2) and TK=complex<double> (nspin=4)
+ * - Early returns: If sc_mag_switch is false, all functions return immediately
+ *   without any overhead
+ * - #ifdef __LCAO: The density matrix pointer is only available in LCAO builds
+ *
+ * @par Workflow
+ * 1. ESolver calls init_deltaspin_lcao() at start of calculation
+ * 2. Each SCF iteration:
+ *    a. ESolver calls cal_mi_lcao_wrapper() to compute magnetic moments
+ *    b. ESolver calls run_deltaspin_lambda_loop_lcao() to optimize lambda
+ *    c. If skip_solve=true, ESolver skips the Hamiltonian solve (lambda loop already did it)
+ */
+
 namespace ModuleESolver
 {
 
+/**
+ * @brief Initialize the SpinConstrain singleton with all input parameters.
+ *
+ * @details Called once at the start of a DeltaSpin calculation. Checks
+ * sc_mag_switch first; if disabled, returns immediately without any action.
+ *
+ * @par Conditional compilation
+ * The density matrix pointer (dm) is only available when __LCAO is defined.
+ * For non-LCAO builds (PW-only), init_sc() is called without the dm parameter.
+ */
 template <typename TK>
 void init_deltaspin_lcao(const UnitCell& ucell,
                           const Input_para& inp,
@@ -18,28 +51,37 @@ void init_deltaspin_lcao(const UnitCell& ucell,
                           void* dm,
                           void* pelec)
 {
+    // Early exit if DeltaSpin is not enabled
     if (!inp.sc_mag_switch)
     {
         return;
     }
-    
+
     spinconstrain::SpinConstrain<TK>& sc = spinconstrain::SpinConstrain<TK>::getScInstance();
 #ifdef __LCAO
+    // LCAO build: pass density matrix pointer
     sc.init_sc(inp.sc_thr, inp.nsc, inp.nsc_min, inp.alpha_trial,
-               inp.sccut, inp.sc_drop_thr, ucell,
+               inp.sccut, inp.sc_drop_thr, ucell, inp.sc_direction_only,
                static_cast<Parallel_Orbitals*>(pv),
                inp.nspin, kv, p_hamilt, psi,
                static_cast<elecstate::DensityMatrix<TK, double>*>(dm),
                static_cast<elecstate::ElecState*>(pelec));
 #else
+    // Non-LCAO build: no density matrix
     sc.init_sc(inp.sc_thr, inp.nsc, inp.nsc_min, inp.alpha_trial,
-               inp.sccut, inp.sc_drop_thr, ucell,
+               inp.sccut, inp.sc_drop_thr, ucell, inp.sc_direction_only,
                static_cast<Parallel_Orbitals*>(pv),
                inp.nspin, kv, p_hamilt, psi,
                static_cast<elecstate::ElecState*>(pelec));
 #endif
 }
 
+/**
+ * @brief Wrapper: calculate magnetic moments for current SCF iteration.
+ *
+ * @details If DeltaSpin is enabled, calls SpinConstrain::cal_mi_lcao().
+ * The moments are stored in Mi_ and can be retrieved via get_target_mag().
+ */
 template <typename TK>
 void cal_mi_lcao_wrapper(const int iter, const Input_para& inp)
 {
@@ -47,43 +89,69 @@ void cal_mi_lcao_wrapper(const int iter, const Input_para& inp)
     {
         return;
     }
-    
+
 #ifdef __LCAO
     spinconstrain::SpinConstrain<TK>& sc = spinconstrain::SpinConstrain<TK>::getScInstance();
     sc.cal_mi_lcao(iter);
 #endif
 }
 
+/**
+ * @brief Wrapper: run the lambda optimization loop.
+ *
+ * @details Decision logic for when to run the lambda loop:
+ *
+ *   Case 1: NOT converged AND charge density is close enough (drho < sc_scf_thr)
+ *   -> Run lambda loop, mark as converged, skip_solve = true
+ *   Rationale: The charge density is stable enough to optimize lambda.
+ *   The lambda loop does its own diagonalization, so skip the outer solve.
+ *
+ *   Case 2: Already converged
+ *   -> Still run lambda loop (to refine for the current charge density)
+ *   -> skip_solve = true
+ *   Rationale: Even if converged, the charge density may have changed
+ *   slightly, requiring lambda refinement.
+ *
+ *   Case 3: NOT converged AND charge density is NOT close enough (drho >= sc_scf_thr)
+ *   -> Do nothing, skip_solve = false
+ *   Rationale: The charge density is still changing significantly, so
+ *   optimizing lambda would be premature. Wait for SCF to stabilize first.
+ *
+ * @param iter Current SCF iteration number
+ * @param drho Charge density convergence criterion (max|drho|)
+ * @param inp Input parameters
+ * @return true if the ESolver should skip the Hamiltonian solve
+ */
 template <typename TK>
 bool run_deltaspin_lambda_loop_lcao(const int iter,
                                      const double drho,
                                      const Input_para& inp)
 {
     bool skip_solve = false;
-    
+
     if (inp.sc_mag_switch)
     {
         spinconstrain::SpinConstrain<TK>& sc = spinconstrain::SpinConstrain<TK>::getScInstance();
-        
+
         if (!sc.mag_converged() && drho > 0 && drho < inp.sc_scf_thr)
         {
-            /// optimize lambda to get target magnetic moments, but the lambda is not near target
+            /// Charge density is stable enough: optimize lambda for the first time
             sc.run_lambda_loop(iter);
             sc.set_mag_converged(true);
             skip_solve = true;
         }
         else if (sc.mag_converged())
         {
-            /// optimize lambda to get target magnetic moments, but the lambda is not near target
+            /// Already converged: refine lambda for the current charge density
             sc.run_lambda_loop(iter);
             skip_solve = true;
         }
     }
-    
+
     return skip_solve;
 }
 
-/// Template instantiation
+/// Template instantiations for both spin types
 template void init_deltaspin_lcao<double>(const UnitCell& ucell,
                                            const Input_para& inp,
                                            void* pv,
@@ -93,13 +161,13 @@ template void init_deltaspin_lcao<double>(const UnitCell& ucell,
                                            void* dm,
                                            void* pelec);
 template void init_deltaspin_lcao<std::complex<double>>(const UnitCell& ucell,
-                                                         const Input_para& inp,
-                                                         void* pv,
-                                                         const K_Vectors& kv,
-                                                         void* p_hamilt,
-                                                         void* psi,
-                                                         void* dm,
-                                                         void* pelec);
+                                                          const Input_para& inp,
+                                                          void* pv,
+                                                          const K_Vectors& kv,
+                                                          void* p_hamilt,
+                                                          void* psi,
+                                                          void* dm,
+                                                          void* pelec);
 
 template void cal_mi_lcao_wrapper<double>(const int iter, const Input_para& inp);
 template void cal_mi_lcao_wrapper<std::complex<double>>(const int iter, const Input_para& inp);
@@ -108,7 +176,7 @@ template bool run_deltaspin_lambda_loop_lcao<double>(const int iter,
                                                       const double drho,
                                                       const Input_para& inp);
 template bool run_deltaspin_lambda_loop_lcao<std::complex<double>>(const int iter,
-                                                                     const double drho,
-                                                                     const Input_para& inp);
+                                                                      const double drho,
+                                                                      const Input_para& inp);
 
 } // namespace ModuleESolver
diff --git a/source/source_lcao/module_deltaspin/init_sc.cpp b/source/source_lcao/module_deltaspin/init_sc.cpp
index ac56047173d..73da9388ec2 100644
--- a/source/source_lcao/module_deltaspin/init_sc.cpp
+++ b/source/source_lcao/module_deltaspin/init_sc.cpp
@@ -1,6 +1,28 @@
 #include "spin_constrain.h"
 
-// init sc
+/**
+ * @file init_sc.cpp
+ * @brief Master initialization for the SpinConstrain singleton.
+ *
+ * @par Called once at the start of a DeltaSpin calculation
+ * This function bridges the UnitCell/InputPara data from the ESolver layer
+ * to the internal SpinConstrain state. After init_sc(), the singleton is
+ * fully configured and ready for the SCF lambda optimization loop.
+ *
+ * @par Initialization order (critical):
+ * 1. Input parameters (convergence thresholds, step sizes)
+ * 2. Atom/orbital/lnchi counts (needed for array sizing)
+ * 3. nspin and npol (determines which code paths are used)
+ * 4. target_mag, lambda, constrain (from STRU parsing)
+ * 5. For nspin=2: force x,y constraint flags to 0 (collinear: only z constrained)
+ * 6. Parallel orbitals info (LCAO-specific)
+ * 7. Solver parameters (Hamiltonian, psi, electronic state pointers)
+ *
+ * @par Error conditions
+ * - If UnitCell.get_atom_Counts() returns empty map, subsequent operations will
+ *   fail with "atomCounts is not set" in check_atomCounts()
+ * - If nspin is not 2 or 4, set_nspin() will call WARNING_QUIT
+ */
 template <typename TK>
 void spinconstrain::SpinConstrain<TK>::init_sc(double sc_thr_in,
 		int nsc_in,
@@ -9,6 +31,7 @@ void spinconstrain::SpinConstrain<TK>::init_sc(double sc_thr_in,
 		double sccut_in,
 		double sc_drop_thr_in,
 		const UnitCell& ucell,
+		bool direction_only_in,
 		Parallel_Orbitals* ParaV_in,
 		int nspin_in,
 		const K_Vectors& kv_in,
@@ -20,24 +43,70 @@ void spinconstrain::SpinConstrain<TK>::init_sc(double sc_thr_in,
 		elecstate::ElecState* pelec_in,
 		ModulePW::PW_Basis_K* pw_wfc_in)
 {
+    // Step 1: Set input parameters for lambda loop
+    // - sc_thr: convergence threshold for RMS(Mi - M_target) in uB
+    // - nsc: maximum inner optimization steps
+    // - nsc_min: minimum steps before early exit checks
+    // - alpha_trial: initial trial step size (eV/uB^2), converted to Ry/uB^2
+    // - sccut: maximum lambda change per step (eV/uB), converted to Ry/uB
+    // - sc_drop_thr: fraction of initial RMS for adaptive threshold
     this->set_input_parameters(sc_thr_in, nsc_in, nsc_min_in, alpha_trial_in, sccut_in, sc_drop_thr_in);
+
+    // Step 2: Get atom/orbital/lnchi counts from UnitCell for indexing
+    // atomCounts: {element_type_index -> number_of_atoms_of_this_type}
+    // orbitalCounts: {element_type_index -> number_of_orbitals_per_atom}
+    // lnchiCounts: {element_type_index -> {angular_momentum_L -> number_of_chi_functions}}
     this->set_atomCounts(ucell.get_atom_Counts());
     this->set_orbitalCounts(ucell.get_orbital_Counts());
     this->set_lnchiCounts(ucell.get_lnchi_Counts());
+
+    // Step 3: Set spin configuration
+    // nspin=2: collinear (spin-up/down separate k-points), npol=1
+    // nspin=4: non-collinear (full spinor), npol=2
     this->set_nspin(nspin_in);
+    this->set_npol((nspin_in == 4) ? 2 : 1);
+
+    // Step 4: Load target magnetic moments and initial lambda from UnitCell
+    // These are parsed from the STRU file's "sc_mag" and "lambda" keywords
     this->set_target_mag(ucell.get_target_mag());
     this->lambda_ = ucell.get_lambda();
     this->constrain_ = ucell.get_constrain();
-    this->atomLabels_ = ucell.get_atomLabels();
-    this->tpiba = ucell.tpiba;
-    this->pw_wfc_ = pw_wfc_in;
-    this->set_decay_grad();
+
+    // Step 5: CRITICAL FIX for collinear spin (nspin=2)
+    // In collinear mode, spins are constrained along the z-axis only.
+    // The x and y components must be set to 0 to prevent the lambda optimizer
+    // from trying to constrain non-existent transverse components.
+    // Without this fix, the optimizer would waste iterations trying to
+    // drive Mx and My to their (usually non-zero) target values, which
+    // is physically meaningless for collinear calculations.
+    if (nspin_in == 2)
+    {
+        for (int iat = 0; iat < static_cast<int>(this->constrain_.size()); iat++)
+        {
+            this->constrain_[iat].x = 0;
+            this->constrain_[iat].y = 0;
+        }
+    }
+
+    // Step 6: Set auxiliary parameters
+    this->atomLabels_ = ucell.get_atomLabels();      // "Fe_0", "Fe_1", etc.
+    this->direction_only_ = direction_only_in;        // Only optimize spin direction
+    this->tpiba = ucell.tpiba;                        // 2*pi/a lattice scaling
+    this->pw_wfc_ = pw_wfc_in;                        // PW basis (PW mode only)
+    this->set_decay_grad();                           // Initialize gradient decay thresholds
+
+    // Step 7: Set parallel orbitals info (for ScaLAPACK distributed matrices)
     if(ParaV_in != nullptr) this->set_ParaV(ParaV_in);
+
+    // Step 8: Set solver parameters (pointers to external objects)
     this->set_solver_parameters(kv_in, p_hamilt_in, psi_in, pelec_in);
+
+    // Step 9: Set density matrix pointer (LCAO mode only)
 #ifdef __LCAO
     this->dm_ = dm_in; // mohan add 2025-11-03
 #endif
 }
 
+// Explicit template instantiations for both spin types
 template class spinconstrain::SpinConstrain<std::complex<double>>;
 template class spinconstrain::SpinConstrain<double>;
diff --git a/source/source_lcao/module_deltaspin/lambda_loop.cpp b/source/source_lcao/module_deltaspin/lambda_loop.cpp
index 5d38c5d2610..e279986f111 100644
--- a/source/source_lcao/module_deltaspin/lambda_loop.cpp
+++ b/source/source_lcao/module_deltaspin/lambda_loop.cpp
@@ -3,127 +3,101 @@
 #include <iostream>
 #include <cmath>
 #include <chrono>
+#include <fstream>
+#include <iomanip>
 
 #include "basic_funcs.h"
 #include "source_io/module_parameter/parameter.h"
+#include "source_base/constants.h"
 
-// lambda = initial_lambda + delta_lambda/(spin2 - spin1) * (target_spin - spin1)
-/*inline void next_lambda(std::vector<ModuleBase::Vector3<double>>& initial_lambda,
-                        std::vector<ModuleBase::Vector3<double>>& delta_lambda,
-                        std::vector<ModuleBase::Vector3<double>>& lambda,
-                        std::vector<ModuleBase::Vector3<double>>& spin1,
-                        std::vector<ModuleBase::Vector3<double>>& spin2,
-                        std::vector<ModuleBase::Vector3<double>>& target_spin)
-{
-    for (int ia = 0; ia < lambda.size(); ia++)
-    {
-        for (int ic = 0; ic < 3; ic++)
-        {
-            lambda[ia][ic] = initial_lambda[ia][ic] + delta_lambda[ia][ic] / (spin2[ia][ic] - spin1[ia][ic]) * (target_spin[ia][ic] - spin1[ia][ic]);
-        }
-    }
-}
-
+/**
+ * @file lambda_loop.cpp
+ * @brief Core lambda optimization algorithms for DeltaSpin.
+ *
+ * @par run_lambda_loop: Conjugate-gradient-like BFGS optimizer
+ * Iteratively adjusts Lagrange multipliers (lambda) to drive atomic magnetic
+ * moments (Mi) toward target values (M_target).
+ *
+ * @par Algorithm overview
+ * The optimizer follows a modified Polak-Ribiere conjugate gradient scheme:
+ *
+ *   Step -1 (Initialization):
+ *     - Compute initial Mi from current wavefunction
+ *     - Save initial lambda (lambda with unconstrained components zeroed)
+ *     - Set adaptive convergence threshold: current_sc_thr_ = max(rms_0 * sc_drop_thr_, sc_thr_)
+ *
+ *   Each inner step (i_step = 0, 1, ..., nsc-1):
+ *     1. Update lambda: lambda = initial_lambda + delta_lambda
+ *     2. [direction_only] Project out parallel component of lambda
+ *     3. cal_mw_from_lambda(): apply lambda -> solve -> compute new Mi
+ *     4. Check gradient decay: if dM/dlambda < decay_grad, exit early
+ *     5. Compute residual: delta_spin = Mi - M_target
+ *     6. Compute RMS error: rms = sqrt(mean(delta_spin^2))
+ *     7. Check convergence: if rms < current_sc_thr_, update_psi_charge() and exit
+ *        [PW basis] Re-check with cal_mi_pw(), recursively rerun if RMS too large
+ *     8. [i_step >= 2] Compute Polak-Ribiere beta = rms^2 / rms_old^2
+ *     9. Update search direction: search = delta_spin + beta * search_old
+ *     10. Apply restriction: cap alpha_trial so that |alpha_trial * search| < restrict_current_
+ *     11. Compute cumulative step: dnu = dnu + alpha_trial * search
+ *     12. [direction_only] Project out parallel component of dnu
+ *     13. Trial step: compute Mi at dnu, find optimal alpha via linear interpolation
+ *     14. Update dnu with optimal alpha
+ *     15. Adapt alpha_trial: if |alpha_opt| >> alpha_trial, increase; else decrease
+ *
+ * @par Key variables
+ * - initial_lambda: lambda with unconstrained components set to 0
+ * - delta_lambda: current lambda change from initial
+ * - dnu: cumulative lambda change (search path integral)
+ * - search: current search direction (steepest descent or conjugate)
+ * - spin, spin_plus: Mi at current and trial lambda values
+ * - alpha_trial: current step size (adaptively adjusted)
+ * - alpha_opt: optimal step size from linear interpolation
+ *
+ * @par Convergence criteria
+ * 1. RMS(Mi - M_target) < current_sc_thr_ (adaptive threshold)
+ * 2. Maximum gradient dM/dlambda < decay_grad[itype] per atom type
+ * 3. Maximum steps reached (nsc)
+ *
+ * @par Error output and solutions
+ * - "RMS error is too large, rerun the loop": The subspace diagonalization
+ *   was not accurate enough. The loop is rerun with rerun=false to use the
+ *   full PW solver for better precision. If this persists, check:
+ *   - PW_DIAG_NMAX and PW_DIAG_THR in DiagoIterAssist
+ *   - higher_mag_prec flag for forced high-precision mode
+ * - "Reach maximum number of steps": Lambda optimization did not converge
+ *   within nsc steps. Check:
+ *   - target_mag values are physically reasonable
+ *   - alpha_trial is not too small (slow convergence)
+ *   - decay_grad thresholds are not too aggressive
+ */
 template <>
-void spinconstrain::SpinConstrain<std::complex<double>>::run_lambda_loop(int outer_step)
+void spinconstrain::SpinConstrain<std::complex<double>>::run_lambda_loop(int outer_step, bool rerun)
 {
-    // init parameters
-    int nat = this->get_nat();
-    std::vector<ModuleBase::Vector3<double>> initial_lambda(nat, 0.0);
-    std::vector<ModuleBase::Vector3<double>> delta_lambda(nat, 0.0);
-    std::vector<ModuleBase::Vector3<double>> spin1(nat, 0.0);
-    std::vector<ModuleBase::Vector3<double>> spin2(nat, 0.0);
-    std::vector<ModuleBase::Vector3<double>> delta_spin(nat, 0.0);
-    // current lambda is this->lambda_
-    // current spin is this->Mi_
-    // target spin is this->target_mag_
-    // loop to optimize lambda to get target spin
-    int step = -1;
-    do
-    {
-        // set initial lambda
-        where_fill_scalar_else_2d(this->constrain_, 0, 0.0, this->lambda_, initial_lambda);
-        // save current spin to spin1 if step > 0
-        if (step > 0)
-        {
-            spin1 = this->Mi_;
-        }
-        // calculate current spin
-        this->cal_mw_from_lambda(step);
-        // save current spin to spin2
-        spin2 = this->Mi_;
-        // calculate delta_spin = target_spin - spin
-        subtract_2d(this->target_mag_, spin2, delta_spin);
-        // check RMS error and stop if needed
-        // calculate RMS error
-        double sum = 0.0;
-        for (int ia = 0; ia < nat; ia++)
-        {
-            for (int ic = 0; ic < 3; ic++)
-            {
-                sum += std::pow(delta_spin[ia][ic],2);
-            }
-        }
-        double rms_error = std::sqrt(sum/nat);
-        std::cout << "RMS error = " << rms_error <<" in step:" <<step << std::endl;
-        // check RMS error and stop if needed
-        if(rms_error < 1e-5)
-        {
-            std::cout<<"success"<<std::endl;
-            break;
-        }
-        // calculate delta_lambda
-        if(1)//step == 0)
-        {
-            for(int ia = 0; ia < nat; ia++)
-            {
-                for(int ic = 2; ic < 3; ic++)
-                {
-                    delta_lambda[ia][ic] = 0.01;//- delta_spin[ia][ic] / 10.0;
-                    this->lambda_[ia][ic] = initial_lambda[ia][ic] + delta_lambda[ia][ic];
-                    std::cout<<__LINE__<<"lambda["<<ia<<"] = "<<this->lambda_[ia][ic]<<std::endl;
-                }
-            }
-        }
-        else
-        {
-            //calculate next lambda
-            next_lambda(initial_lambda, delta_lambda, this->lambda_, spin1, spin2, this->target_mag_);
-            // calculate delta_lambda = this->lambda - initial_lambda
-            subtract_2d(this->lambda_, initial_lambda, delta_lambda);
-        }
-        step++;
-    } while (step < this->nsc_);
-    
-}*/
-
-
-template <>
-void spinconstrain::SpinConstrain<std::complex<double>>::run_lambda_loop(
-        int outer_step,
-		bool rerun)
-{
-    // init controlling parameters
     int nat = this->get_nat();
     int ntype = this->get_ntype();
-    std::vector<ModuleBase::Vector3<double>> initial_lambda(nat,0.0);
-    std::vector<ModuleBase::Vector3<double>> delta_lambda(nat,0.0);
-    // set nu, dnu and dnu_last_step
-    std::vector<ModuleBase::Vector3<double>> dnu(nat, 0.0), dnu_last_step(nat, 0.0);
-    // two controlling temp variables
-    std::vector<ModuleBase::Vector3<double>> temp_1(nat, 0.0);
-    std::vector<ModuleBase::Vector3<double>> spin(nat, 0.0), delta_spin(nat, 0.0);
-    std::vector<ModuleBase::Vector3<double>> search(nat, 0.0), search_old(nat, 0.0);
-    std::vector<ModuleBase::Vector3<double>> new_spin(nat, 0.0), spin_plus(nat, 0.0);
 
-    double alpha_opt, alpha_plus;
-    double beta = 0.0, g = 0.0, mean_error = 0.0, mean_error_old = 0.0, rms_error = 0.0;
+    // =============================================================
+    // STATE VECTORS (all sized [nat][3])
+    // =============================================================
+    std::vector<ModuleBase::Vector3<double>> initial_lambda(nat,0.0); ///< Lambda with unconstrained components = 0
+    std::vector<ModuleBase::Vector3<double>> delta_lambda(nat,0.0);   ///< Current lambda change from initial
+    std::vector<ModuleBase::Vector3<double>> dnu(nat, 0.0), dnu_last_step(nat, 0.0); ///< Cumulative step, previous step
+    std::vector<ModuleBase::Vector3<double>> temp_1(nat, 0.0);        ///< Temporary workspace
+    std::vector<ModuleBase::Vector3<double>> spin(nat, 0.0), delta_spin(nat, 0.0);   ///< Current Mi, residual (Mi - M_target)
+    std::vector<ModuleBase::Vector3<double>> search(nat, 0.0), search_old(nat, 0.0); ///< Search direction, previous direction
+    std::vector<ModuleBase::Vector3<double>> new_spin(nat, 0.0), spin_plus(nat, 0.0); ///< Mi at current and trial lambda
+
+    double alpha_opt, alpha_plus;  ///< Optimal step size, correction to trial
+    double beta;                    ///< Polak-Ribiere conjugate gradient parameter
+    double mean_error, mean_error_old, rms_error; ///< Mean squared error, RMS error
+    double g;                       ///< Adaptation factor for alpha_trial
 
-    double alpha_trial = this->alpha_trial_;
+    double alpha_trial = this->alpha_trial_; ///< Current trial step size (Ry/uB^2)
 
     const double zero = 0.0;
     const double one = 1.0;
 
+    // Timer initialization (MPI or CPU)
 #ifdef __MPI
 	auto iterstart = MPI_Wtime();
 #else
@@ -133,16 +107,27 @@ void spinconstrain::SpinConstrain<std::complex<double>>::run_lambda_loop(
     double inner_loop_duration = 0.0;
 
     this->print_header();
-    // lambda loop
+
+    // =============================================================
+    // MAIN OPTIMIZATION LOOP
+    // i_step = -1: initialization (compute initial Mi, save initial lambda)
+    // i_step = 0, 1, ..., nsc-1: optimization steps
+    // =============================================================
     for (int i_step = -1; i_step < this->nsc_; i_step++)
     {
         double duration = 0.0;
         if (i_step == -1)
         {
-
+            // =============================================================
+            // STEP -1: INITIALIZATION
+            // Compute initial magnetic moments and save starting state
+            // =============================================================
             this->cal_mw_from_lambda(i_step);
             spin = this->Mi_;
+
+            // Save initial lambda: for unconstrained components (constrain==0), set to 0
             where_fill_scalar_else_2d(this->constrain_, 0, zero, this->lambda_, initial_lambda);
+
             print_2d("initial lambda (eV/uB): ", initial_lambda, this->nspin_, ModuleBase::Ry_to_eV);
             print_2d("initial spin (uB): ", spin, this->nspin_);
             print_2d("target spin (uB): ", this->target_mag_, this->nspin_);
@@ -150,17 +135,48 @@ void spinconstrain::SpinConstrain<std::complex<double>>::run_lambda_loop(
         }
         else
         {
-            where_fill_scalar_else_2d(this->constrain_, 0, zero, delta_lambda, delta_lambda);
+            // =============================================================
+            // OPTIMIZATION STEP
+            // Update lambda, compute new Mi, check convergence
+            // =============================================================
+
+            // Mask unconstrained components of delta_lambda to 0
+            where_fill_scalar_2d(this->constrain_, 0, zero, delta_lambda);
+
+            // lambda = initial_lambda + delta_lambda
             add_scalar_multiply_2d(initial_lambda, delta_lambda, one, this->lambda_);
 
-            this->cal_mw_from_lambda(i_step);
+            // [direction_only mode] Project out parallel component of lambda
+            // This keeps |lambda| -> 0, only constraining spin direction
+            if(this->direction_only_)
+            for (int ia = 0; ia < nat; ia++)
+            {
+                const auto& target = this->target_mag_[ia];
+                const double norm = std::sqrt(target.x*target.x + target.y*target.y + target.z*target.z);
+
+                if (norm > 1e-8) {
+                    const ModuleBase::Vector3<double> dir = target / norm;
+                    double parallel = this->lambda_[ia].x*dir.x +
+                                    this->lambda_[ia].y*dir.y +
+                                    this->lambda_[ia].z*dir.z;
+                    this->lambda_[ia].x -= parallel * dir.x;
+                    this->lambda_[ia].y -= parallel * dir.y;
+                    this->lambda_[ia].z -= parallel * dir.z;
+                }
+            }
 
+            // Apply lambda and compute new magnetic moments
+            this->cal_mw_from_lambda(i_step, delta_lambda.data());
             new_spin = this->Mi_;
+
+            // Check if gradient dM/dlambda has decayed below threshold
             bool GradLessThanBound = this->check_gradient_decay(new_spin, spin, delta_lambda, dnu_last_step);
             if (i_step >= this->nsc_min_ && GradLessThanBound)
             {
+                // Gradient has decayed: further optimization yields diminishing returns
+                // Apply the last successful step and exit
                 add_scalar_multiply_2d(initial_lambda, dnu_last_step, one, this->lambda_);
-                this->update_psi_charge(dnu_last_step.data());
+                this->update_psi_charge(dnu_last_step.data(), true, true);
 #ifdef __MPI
 		        duration = (double)(MPI_Wtime() - iterstart);
 #else
@@ -175,10 +191,46 @@ void spinconstrain::SpinConstrain<std::complex<double>>::run_lambda_loop(
             }
             spin = new_spin;
         }
-        // continue the lambda loop
+
+        // =============================================================
+        // COMPUTE RESIDUAL AND RMS ERROR
+        // =============================================================
+        // delta_spin = spin - target_mag (residual error)
         subtract_2d(spin, this->target_mag_, delta_spin);
+        // Mask unconstrained components to 0 (they don't contribute to error)
         where_fill_scalar_2d(this->constrain_, 0, zero, delta_spin);
+
+        // Search direction starts as the residual (steepest descent)
         search = delta_spin;
+
+        // [direction_only mode] Modify residual to exclude parallel component
+        // and adjust target_mag to maintain direction constraint
+        if(this->direction_only_)
+        for (int ia = 0; ia < nat; ia++)
+        {
+            const auto& target = this->target_mag_[ia];
+            const double norm = std::sqrt(target.x*target.x + target.y*target.y + target.z*target.z);
+
+            if (norm > 1e-8) {
+                const ModuleBase::Vector3<double> dir = target / norm;
+                const double parallel = delta_spin[ia].x*dir.x + delta_spin[ia].y*dir.y + delta_spin[ia].z*dir.z;
+                // Store perpendicular component squared in temp_1 (for RMS)
+                temp_1[ia][0] = std::pow(delta_spin[ia].x,2) + std::pow(delta_spin[ia].y,2) +
+                                std::pow(delta_spin[ia].z,2) - std::pow(parallel,2);
+                temp_1[ia][1] = 0;
+                temp_1[ia][2] = 0;
+                // Adjust target to include parallel component
+                this->target_mag_[ia] += parallel * dir;
+            }
+            else {
+                temp_1[ia][0] = std::pow(delta_spin[ia].x,2) +
+                              std::pow(delta_spin[ia].y,2) +
+                              std::pow(delta_spin[ia].z,2);
+                temp_1[ia][1] = 0;
+                temp_1[ia][2] = 0;
+            }
+        }
+        else
         for (int ia = 0; ia < nat; ia++)
         {
             for (int ic = 0; ic < 3; ic++)
@@ -188,11 +240,16 @@ void spinconstrain::SpinConstrain<std::complex<double>>::run_lambda_loop(
         }
         mean_error = sum_2d(temp_1) / nat;
         rms_error = std::sqrt(mean_error);
+
+        // Set adaptive convergence threshold on first step
         if(i_step == 0)
         {
-            // set current_sc_thr_ to max(rms_error * sc_drop_thr, this->sc_thr_)
             this->current_sc_thr_ = std::max(rms_error * this->sc_drop_thr_, this->sc_thr_);
         }
+
+        // =============================================================
+        // CHECK CONVERGENCE
+        // =============================================================
 #ifdef __MPI
 			duration = (double)(MPI_Wtime() - iterstart);
 #else
@@ -203,11 +260,12 @@ void spinconstrain::SpinConstrain<std::complex<double>>::run_lambda_loop(
         inner_loop_duration += duration;
         if (this->check_rms_stop(outer_step, i_step, rms_error, duration, inner_loop_duration))
         {
-            //add_scalar_multiply_2d(initial_lambda, dnu_last_step, 1.0, this->lambda_);
-            this->update_psi_charge(dnu_last_step.data(), rerun);
+            // Converged or max steps reached: final update
+            this->update_psi_charge(dnu_last_step.data(), rerun, true);
+
+            // [PW basis] Extra verification: re-compute Mi from scratch
             if(PARAM.inp.basis_type == "pw")
             {
-                //double check Atomic spin moment
                 this->cal_mi_pw();
                 subtract_2d(this->Mi_, this->target_mag_, delta_spin);
                 where_fill_scalar_2d(this->constrain_, 0, zero, delta_spin);
@@ -222,6 +280,9 @@ void spinconstrain::SpinConstrain<std::complex<double>>::run_lambda_loop(
                 mean_error = sum_2d(temp_1) / nat;
                 rms_error = std::sqrt(mean_error);
                 std::cout<<"Current RMS: "<<rms_error<<std::endl;
+
+                // If RMS is still large after full update, recursively rerun
+                // with higher precision (full PW solver instead of subspace only)
                 if(rms_error > this->current_sc_thr_ * 10 && rerun == true && this->higher_mag_prec == true)
                 {
                     std::cout<<"Error: RMS error is too large, rerun the loop"<<std::endl;
@@ -230,42 +291,99 @@ void spinconstrain::SpinConstrain<std::complex<double>>::run_lambda_loop(
             }
             break;
         }
+
+        // Reset timer for next iteration
 #ifdef __MPI
 		iterstart = MPI_Wtime();
 #else
 		iterstart = std::chrono::system_clock::now();
 #endif
+
+        // =============================================================
+        // POLAK-RIBIERE CONJUGATE GRADIENT UPDATE
+        // =============================================================
+        // For i_step >= 2, compute conjugate direction
         if (i_step >= 2)
         {
+            // Polak-Ribiere beta = ||gradient_new||^2 / ||gradient_old||^2
             beta = mean_error / mean_error_old;
+            // search = delta_spin + beta * search_old (conjugate direction)
             add_scalar_multiply_2d(search, search_old, beta, search);
         }
-        /// check if restriction is needed
+
+        // Cap step size to prevent overshooting
         this->check_restriction(search, alpha_trial);
 
+        // =============================================================
+        // CUMULATIVE STEP UPDATE
+        // =============================================================
         dnu_last_step = dnu;
+        // dnu = dnu + alpha_trial * search
         add_scalar_multiply_2d(dnu, search, alpha_trial, dnu);
+
+        // [direction_only] Project out parallel component from dnu
+        if(this->direction_only_)
+        for (int ia = 0; ia < nat; ia++) {
+            const auto& target = this->target_mag_[ia];
+            const double norm = std::sqrt(target.x*target.x + target.y*target.y + target.z*target.z);
+
+            if (norm > 1e-8) {
+                const ModuleBase::Vector3<double> dir = target / norm;
+                double parallel = dnu[ia].x*dir.x + dnu[ia].y*dir.y + dnu[ia].z*dir.z;
+                dnu[ia].x -= parallel * dir.x;
+                dnu[ia].y -= parallel * dir.y;
+                dnu[ia].z -= parallel * dir.z;
+            }
+        }
         delta_lambda = dnu;
 
+        // Mask unconstrained components
         where_fill_scalar_else_2d(this->constrain_, 0, zero, delta_lambda, delta_lambda);
+        // Update lambda
         add_scalar_multiply_2d(initial_lambda, delta_lambda, one, this->lambda_);
 
+        // =============================================================
+        // TRIAL STEP: compute Mi at trial position
+        // =============================================================
         this->cal_mw_from_lambda(i_step, delta_lambda.data());
-
         spin_plus = this->Mi_;
 
+        // Find optimal step size via linear interpolation
         alpha_opt = this->cal_alpha_opt(spin, spin_plus, alpha_trial);
-        /// check if restriction is needed
         this->check_restriction(search, alpha_opt);
 
+        // Correct dnu: dnu += (alpha_opt - alpha_trial) * search
         alpha_plus = alpha_opt - alpha_trial;
         scalar_multiply_2d(search, alpha_plus, temp_1);
         add_scalar_multiply_2d(dnu, temp_1, one, dnu);
+
+        // [direction_only] Project out parallel component from corrected dnu
+        if(this->direction_only_)
+        for (int ia = 0; ia < nat; ia++) {
+            const auto& target = this->target_mag_[ia];
+            const double norm = std::sqrt(target.x*target.x + target.y*target.y + target.z*target.z);
+
+            if (norm > 1e-8) {
+                const ModuleBase::Vector3<double> dir = target / norm;
+                double parallel = dnu[ia].x*dir.x + dnu[ia].y*dir.y + dnu[ia].z*dir.z;
+                dnu[ia].x -= parallel * dir.x;
+                dnu[ia].y -= parallel * dir.y;
+                dnu[ia].z -= parallel * dir.z;
+            }
+        }
         delta_lambda = dnu;
 
+        // =============================================================
+        // ADAPT STEP SIZE FOR NEXT ITERATION
+        // =============================================================
         search_old = search;
         mean_error_old = mean_error;
 
+        // Adapt alpha_trial based on ratio of optimal to trial step
+        // g = 1.5 * |alpha_opt| / alpha_trial
+        // - g > 2.0: alpha_opt was much larger than alpha_trial -> increase alpha_trial
+        // - g < 0.5: alpha_opt was much smaller -> decrease alpha_trial
+        // - 0.5 <= g <= 2.0: step size is reasonable -> modest adjustment
         g = 1.5 * std::abs(alpha_opt) / alpha_trial;
         if (g > 2.0)
         {
@@ -280,3 +398,217 @@ void spinconstrain::SpinConstrain<std::complex<double>>::run_lambda_loop(
 
     return;
 }
+
+/**
+ * @file lambda_loop.cpp (continued)
+ * @brief Linear lambda scan mode for energy landscape mapping.
+ *
+ * @par Purpose
+ * Instead of optimizing lambda to match target moments, this function
+ * sweeps lambda values from sc_scan_lambda_start to sc_scan_lambda_end
+ * in equal steps, computing Mi at each point. Useful for:
+ * - Debugging: understanding the Mi vs lambda relationship
+ * - Plotting: creating E(lambda) curves for analysis
+ * - Validation: checking that Mi responds monotonically to lambda
+ *
+ * @par Output
+ * Results written to lambda_scan_results.dat with columns:
+ *   step, lambda_eV_uB, Mi_x_0, Mi_y_0, Mi_z_0, Mi_x_1, ...
+ */
+template <>
+void spinconstrain::SpinConstrain<std::complex<double>>::run_lambda_linear_scan(int outer_step)
+{
+    int nat = this->get_nat();
+    int ntype = this->get_ntype();
+
+    double lambda_start = PARAM.inp.sc_scan_lambda_start;
+    double lambda_end = PARAM.inp.sc_scan_lambda_end;
+    int nsteps = PARAM.inp.sc_scan_steps;
+
+    if (nsteps <= 0) {
+        std::cout << "[DS-DIAG] linear_scan: sc_scan_steps <= 0, skipping" << std::endl;
+        return;
+    }
+
+    // Convert eV to Ry for internal calculations
+    double lambda_start_ry = lambda_start / ModuleBase::Ry_to_eV;
+    double lambda_end_ry = lambda_end / ModuleBase::Ry_to_eV;
+    double lambda_step = (lambda_end_ry - lambda_start_ry) / (nsteps - 1);
+
+    std::cout << "\n" << std::string(80, '=') << std::endl;
+    std::cout << "[DS-DIAG] === LINEAR LAMBDA SCAN START ===" << std::endl;
+    std::cout << "[DS-DIAG] Scan range: " << lambda_start << " -> " << lambda_end << " eV/uB" << std::endl;
+    std::cout << "[DS-DIAG] Number of steps: " << nsteps << std::endl;
+    std::cout << "[DS-DIAG] Lambda step size: " << lambda_step * ModuleBase::Ry_to_eV << " eV/uB" << std::endl;
+    std::cout << "[DS-DIAG] nat = " << nat << ", ntype = " << ntype << std::endl;
+    std::cout << "[DS-DIAG] nspin_ = " << this->nspin_ << ", npol_ = " << this->npol_ << std::endl;
+    std::cout << "[DS-DIAG] p_operator = " << (this->p_operator ? "valid" : "NULL") << std::endl;
+    std::cout << "[DS-DIAG] constrain_ size = " << this->constrain_.size() << std::endl;
+
+    // Check if any constraints are defined; if not, set all atoms as constrained
+    bool has_constraints = false;
+    for (int ia = 0; ia < nat; ia++) {
+        if (this->constrain_[ia].x != 0 || this->constrain_[ia].y != 0 || this->constrain_[ia].z != 0) {
+            has_constraints = true;
+            break;
+        }
+    }
+
+    if (!has_constraints) {
+        std::cout << "[DS-DIAG] No constraints found in STRU, setting all atoms as constrained" << std::endl;
+        for (int ia = 0; ia < nat; ia++) {
+            if (this->nspin_ == 4) {
+                this->constrain_[ia] = ModuleBase::Vector3<int>(1, 1, 1);
+            } else {
+                this->constrain_[ia] = ModuleBase::Vector3<int>(0, 0, 1);
+            }
+        }
+        this->reset_dspin_operator();
+    }
+
+    for (int ia = 0; ia < nat; ia++) {
+        std::cout << "[DS-DIAG]   Atom " << ia << " constrain = ("
+                  << this->constrain_[ia].x << ", " << this->constrain_[ia].y << ", " << this->constrain_[ia].z << ")"
+                  << " target_mag = (" << this->target_mag_[ia].x << ", " << this->target_mag_[ia].y << ", " << this->target_mag_[ia].z << ")" << std::endl;
+    }
+    std::cout << std::string(80, '=') << "\n" << std::endl;
+
+    // Save initial lambda to restore after scan
+    std::vector<ModuleBase::Vector3<double>> initial_lambda(nat, 0.0);
+    where_fill_scalar_else_2d(this->constrain_, 0, 0.0, this->lambda_, initial_lambda);
+
+    // Open output file
+    std::ofstream ofs_scan;
+    if (outer_step == 0) {
+        ofs_scan.open("lambda_scan_results.dat");
+        ofs_scan << "# Linear Lambda Scan Results" << std::endl;
+        ofs_scan << "# lambda_start = " << lambda_start << " eV/uB" << std::endl;
+        ofs_scan << "# lambda_end = " << lambda_end << " eV/uB" << std::endl;
+        ofs_scan << "# nsteps = " << nsteps << std::endl;
+        ofs_scan << "#" << std::endl;
+        ofs_scan << "# SCF iteration: " << outer_step << std::endl;
+    } else {
+        ofs_scan.open("lambda_scan_results.dat", std::ios::app);
+        ofs_scan << "#" << std::endl;
+        ofs_scan << "# SCF iteration: " << outer_step << std::endl;
+    }
+
+    // Write header
+    ofs_scan << "# step  lambda_eV_uB";
+    for (int ia = 0; ia < nat; ia++) {
+        ofs_scan << "  Mi_x_" << ia << "  Mi_y_" << ia << "  Mi_z_" << ia;
+    }
+    ofs_scan << std::endl;
+
+    double original_sc_thr = this->sc_thr_;
+
+    // Save step 0 Mi for consistency check later
+    std::vector<ModuleBase::Vector3<double>> mi_step0;
+
+    // =============================================================
+    // SCAN LOOP: sweep lambda from start to end
+    // =============================================================
+    for (int istep = 0; istep < nsteps; istep++) {
+        double lambda_val_ry = lambda_start_ry + istep * lambda_step;
+        double lambda_val_ev = lambda_val_ry * ModuleBase::Ry_to_eV;
+
+        // Set lambda for all constrained atoms/components
+        for (int ia = 0; ia < nat; ia++) {
+            for (int ic = 0; ic < 3; ic++) {
+                if (this->constrain_[ia][ic] != 0) {
+                    this->lambda_[ia][ic] = lambda_val_ry;
+                } else {
+                    this->lambda_[ia][ic] = 0.0;
+                }
+            }
+        }
+
+        std::cout << "[DS-DIAG] === Scan step " << istep << "/" << nsteps
+                  << " lambda = " << lambda_val_ev << " eV/uB ===" << std::endl;
+
+        // Compute magnetic moments at current lambda
+        this->cal_mw_from_lambda(istep);
+
+        // Save step 0 Mi for consistency verification
+        if (istep == 0) {
+            mi_step0 = this->Mi_;
+        }
+
+        // Write results
+        ofs_scan << std::scientific << std::setprecision(6);
+        ofs_scan << istep << "  " << lambda_val_ev;
+        for (int ia = 0; ia < nat; ia++) {
+            ofs_scan << "  " << this->Mi_[ia].x
+                     << "  " << this->Mi_[ia].y
+                     << "  " << this->Mi_[ia].z;
+        }
+        ofs_scan << std::endl;
+
+        std::cout << "[DS-DIAG]   lambda = " << lambda_val_ev << " eV/uB" << std::endl;
+        for (int ia = 0; ia < nat; ia++) {
+            std::cout << "[DS-DIAG]   Atom " << ia << " Mi = ("
+                      << this->Mi_[ia].x << ", "
+                      << this->Mi_[ia].y << ", "
+                      << this->Mi_[ia].z << ") uB" << std::endl;
+        }
+        std::cout << std::endl;
+    }
+
+    // =============================================================
+    // CONSISTENCY CHECK: restore initial lambda and recompute Mi
+    // to verify that the lambda->Mi mapping is numerically stable
+    // after multiple lambda updates in the scan loop
+    // =============================================================
+    std::cout << "[DS-DIAG] === Consistency check: restoring initial lambda ===" << std::endl;
+    this->lambda_ = initial_lambda;
+    this->cal_mw_from_lambda(nsteps);
+
+    // Write consistency check result
+    ofs_scan << std::scientific << std::setprecision(6);
+    ofs_scan << "init_recheck  " << lambda_start;
+    for (int ia = 0; ia < nat; ia++) {
+        ofs_scan << "  " << this->Mi_[ia].x
+                 << "  " << this->Mi_[ia].y
+                 << "  " << this->Mi_[ia].z;
+    }
+    ofs_scan << std::endl;
+
+    std::cout << "[DS-DIAG]   lambda = " << lambda_start << " eV/uB (restored)" << std::endl;
+    for (int ia = 0; ia < nat; ia++) {
+        std::cout << "[DS-DIAG]   Atom " << ia << " Mi = ("
+                  << this->Mi_[ia].x << ", "
+                  << this->Mi_[ia].y << ", "
+                  << this->Mi_[ia].z << ") uB" << std::endl;
+    }
+
+    // Compare restored Mi with step 0 Mi to check consistency
+    ofs_scan << "# [consistency] step 0 vs init_recheck Mi difference:" << std::endl;
+    double max_mi_diff = 0.0;
+    for (int ia = 0; ia < nat; ia++) {
+        double dx = std::abs(this->Mi_[ia].x - mi_step0[ia].x);
+        double dy = std::abs(this->Mi_[ia].y - mi_step0[ia].y);
+        double dz = std::abs(this->Mi_[ia].z - mi_step0[ia].z);
+        double diff = std::max({dx, dy, dz});
+        if (diff > max_mi_diff) max_mi_diff = diff;
+        ofs_scan << "#   Atom " << ia << " dM = (" << dx << ", " << dy << ", " << dz << ") uB" << std::endl;
+    }
+    std::cout << "[DS-DIAG] Max Mi difference between step 0 and init_recheck: " << max_mi_diff << " uB" << std::endl;
+    if (max_mi_diff > 1e-8) {
+        std::cout << "[DS-DIAG] WARNING: Mi mapping may be inconsistent after multiple lambda updates!" << std::endl;
+    } else {
+        std::cout << "[DS-DIAG] OK: Mi mapping is consistent." << std::endl;
+    }
+    ofs_scan << "#   Max Mi difference: " << max_mi_diff << " uB" << std::endl;
+
+    ofs_scan.close();
+
+    // Restore original lambda values (already restored above, but explicit for clarity)
+    this->lambda_ = initial_lambda;
+
+    std::cout << std::string(80, '=') << std::endl;
+    std::cout << "[DS-DIAG] === LINEAR LAMBDA SCAN COMPLETE ===" << std::endl;
+    std::cout << "[DS-DIAG] Results written to: lambda_scan_results.dat" << std::endl;
+    std::cout << std::string(80, '=') << "\n" << std::endl;
+
+    return;
+}
diff --git a/source/source_lcao/module_deltaspin/lambda_loop_helper.cpp b/source/source_lcao/module_deltaspin/lambda_loop_helper.cpp
index 6ad4db05adb..9f101bd13f7 100644
--- a/source/source_lcao/module_deltaspin/lambda_loop_helper.cpp
+++ b/source/source_lcao/module_deltaspin/lambda_loop_helper.cpp
@@ -1,6 +1,33 @@
 #include "basic_funcs.h"
 #include "spin_constrain.h"
 
+/**
+ * @file lambda_loop_helper.cpp
+ * @brief Helper/auxiliary methods for the lambda optimization loop.
+ *
+ * @par Functions overview
+ * - print_termination(): Print final spin and lambda values when loop exits
+ * - check_rms_stop(): Check convergence and print step info
+ * - print_header(): Print header at loop start
+ * - check_restriction(): Cap step size to prevent overshooting
+ * - cal_alpha_opt(): Compute optimal step size via linear interpolation
+ * - check_gradient_decay(): Check if dM/dlambda has decayed below threshold
+ */
+
+/**
+ * @brief Print final spin and lambda values when lambda loop terminates.
+ *
+ * @par Output
+ * - "after-optimization spin (uB)": Final magnetic moments Mi for each atom
+ * - "after-optimization lambda (eV/uB)": Final Lagrange multipliers for each atom
+ * - "Inner optimization for lambda ends.": Termination marker
+ *
+ * @par Interpretation
+ * - Mi close to target_mag: constraint successfully satisfied
+ * - Mi far from target_mag: constraint not converged (check RMS error in log)
+ * - lambda ≈ 0: system naturally has the target moment
+ * - lambda large: system resists the constraint (may indicate unrealistic target)
+ */
 template <>
 void spinconstrain::SpinConstrain<std::complex<double>>::print_termination()
 {
@@ -10,6 +37,26 @@ void spinconstrain::SpinConstrain<std::complex<double>>::print_termination()
     std::cout << "===============================================================================" << std::endl;
 }
 
+/**
+ * @brief Check if RMS error is below convergence threshold or max steps reached.
+ *
+ * @par Output
+ * Prints step info: "Step (Outer -- Inner) = X -- Y   RMS = Z   TIME(s) = T"
+ *
+ * @par Termination messages
+ * - "Meet convergence criterion": RMS < current_sc_thr_ (successfully converged)
+ * - "Reach maximum number of steps": i_step == nsc_ - 1 (did not converge)
+ *
+ * @par Return value
+ * - true: loop should terminate (either converged or max steps)
+ * - false: continue optimization
+ *
+ * @param outer_step Current SCF outer iteration
+ * @param i_step Current inner lambda step
+ * @param rms_error Current RMS error of Mi - M_target
+ * @param duration Time for this step
+ * @param total_duration Cumulative time for inner loop
+ */
 template <>
 bool spinconstrain::SpinConstrain<std::complex<double>>::check_rms_stop(int outer_step,
                                                                                   int i_step,
@@ -37,7 +84,7 @@ bool spinconstrain::SpinConstrain<std::complex<double>>::check_rms_stop(int oute
     return false;
 }
 
-/// print header
+/// @brief Print header at start of lambda optimization loop
 template <>
 void spinconstrain::SpinConstrain<std::complex<double>>::print_header()
 {
@@ -46,7 +93,22 @@ void spinconstrain::SpinConstrain<std::complex<double>>::print_header()
     std::cout << "Covergence criterion for the iteration: " << this->sc_thr_ << std::endl;
 }
 
-/// check restriction
+/**
+ * @brief Cap step size to prevent overshooting in lambda optimization.
+ *
+ * @details If |alpha_trial * max(search)| > restrict_current_, reduce alpha_trial
+ * so that the maximum lambda change per step is bounded by restrict_current_.
+ *
+ * This prevents the optimizer from taking steps that are too large, which
+ * could lead to oscillation or divergence.
+ *
+ * @par Output (when restriction is applied)
+ * - "alpha after restrict = X eV/uB^2": The capped step size
+ * - "boundary after = X eV/uB": The actual maximum lambda change
+ *
+ * @param search Current search direction
+ * @param alpha_trial Trial step size (modified in place if capped)
+ */
 template <>
 void spinconstrain::SpinConstrain<std::complex<double>>::check_restriction(
     const std::vector<ModuleBase::Vector3<double>>& search,
@@ -63,7 +125,32 @@ void spinconstrain::SpinConstrain<std::complex<double>>::check_restriction(
     }
 }
 
-/// calculate alpha_opt
+/**
+ * @brief Compute optimal step size via linear interpolation.
+ *
+ * @par Algorithm
+ * Uses the two-point linear interpolation (secant method) to find the
+ * step size that would drive Mi to M_target:
+ *
+ *   alpha_opt = sum_k / sum_k2 * alpha_trial
+ *
+ * where:
+ *   sum_k  = sum((target - spin) . (spin_plus - spin))   over constrained components
+ *   sum_k2 = sum(|spin - spin_plus|^2)                   over constrained components
+ *
+ * This is equivalent to finding the minimum of a quadratic approximation
+ * to E(lambda) along the search direction.
+ *
+ * @par Edge case handling
+ * - If |sum_k2| < 1e-30: spin and spin_plus are nearly identical, meaning
+ *   the lambda change has no effect on Mi. Return alpha_trial as fallback.
+ *   This can happen if the system is already saturated or if lambda is too small.
+ *
+ * @param spin Mi at current lambda
+ * @param spin_plus Mi at trial lambda (current + alpha_trial * search)
+ * @param alpha_trial Current trial step size
+ * @return Optimal step size alpha_opt
+ */
 template <>
 double spinconstrain::SpinConstrain<std::complex<double>>::cal_alpha_opt(
     std::vector<ModuleBase::Vector3<double>> spin,
@@ -71,7 +158,10 @@ double spinconstrain::SpinConstrain<std::complex<double>>::cal_alpha_opt(
     const double alpha_trial)
 {
     int nat = this->get_nat();
+    const bool print = false;
     const double zero = 0.0;
+
+    // Mask to only constrained components
     std::vector<ModuleBase::Vector3<double>> spin_mask(nat, 0.0);
     std::vector<ModuleBase::Vector3<double>> target_spin_mask(nat, 0.0);
     std::vector<ModuleBase::Vector3<double>> spin_plus_mask(nat, 0.0);
@@ -81,21 +171,77 @@ double spinconstrain::SpinConstrain<std::complex<double>>::cal_alpha_opt(
     where_fill_scalar_else_2d(this->constrain_, 0, zero, spin, spin_mask);
     where_fill_scalar_else_2d(this->constrain_, 0, zero, spin_plus, spin_plus_mask);
 
+    // Compute dot products for linear interpolation
     for (int ia = 0; ia < nat; ia++)
     {
         for (int ic = 0; ic < 3; ic++)
         {
+            // sum_k: (target - current) . (trial - current)
             temp_1[ia][ic]
                 = (target_spin_mask[ia][ic] - spin_mask[ia][ic]) * (spin_plus_mask[ia][ic] - spin_mask[ia][ic]);
+            // sum_k2: |current - trial|^2
             temp_2[ia][ic] = std::pow(spin_mask[ia][ic] - spin_plus_mask[ia][ic], 2);
         }
     }
     double sum_k = sum_2d(temp_1);
     double sum_k2 = sum_2d(temp_2);
+
+    // Debug output (controlled by print flag)
+    for(int ia=0; ia<std::min(2,(int)nat); ++ia) {
+        if (print) {
+        printf("[ALPHA-OPT] nat=%d sum_k=%.6e sum_k2=%.6e alpha_trial=%.6e\n", nat, sum_k, sum_k2, alpha_trial);
+        printf("[ALPHA-OPT] spin[%d]=(%.4f,%.4f,%.4f) spin_plus[%d]=(%.4f,%.4f,%.4f)\n",
+                ia, spin[ia].x, spin[ia].y, spin[ia].z,
+                ia, spin_plus[ia].x, spin_plus[ia].y, spin_plus[ia].z);
+        }
+    }
+
+    // Guard against division by zero
+    if (std::abs(sum_k2) < 1e-30) {
+        if (print) {
+        printf("[ALPHA-OPT] WARNING: sum_k2 too small, returning alpha_trial\n");
+        }
+        fflush(stdout);
+        return alpha_trial;
+    }
+    fflush(stdout);
     return sum_k * alpha_trial / sum_k2;
 }
 
-/// check gradient decay
+/**
+ * @brief Check if the magnetic susceptibility gradient dM/dlambda has decayed below threshold.
+ *
+ * @par Algorithm
+ * 1. Compute spin_change = new_spin - spin (change in magnetic moments)
+ * 2. Compute nu_change = delta_lambda - dnu_last_step (change in lambda)
+ * 3. Compute full gradient matrix: dM[ia][ic]/dlambda[ja][jc] = spin_change[ia][ic] / nu_change[ja][jc]
+ * 4. Extract diagonal: dM[ia][ic]/dlambda[ia][ic] (self-susceptibility)
+ * 5. Find max diagonal gradient per atom type
+ * 6. If max_gradient[itype] < decay_grad[itype], return true (early termination)
+ *
+ * @par Physical meaning
+ * The diagonal gradient dM/dlambda represents how sensitive the magnetic moment
+ * is to changes in the Lagrange multiplier. When this gradient becomes very small,
+ * further increases in lambda produce diminishing returns in Mi, indicating that
+ * the optimization has reached its practical limit.
+ *
+ * @par Output (when triggered)
+ * "Reach limitation of current step ( maximum gradient < X uB^2/eV in atom type Y ), exit."
+ *
+ * @par Debug output [GRAD-DECAY]
+ * - WARNING: nu_change too small: indicates delta_lambda and dnu_last_step are
+ *   nearly identical, meaning the optimizer is not making progress. This can happen
+ *   if alpha_trial has become very small or if the search direction is nearly zero.
+ *   Solution: check that alpha_trial is not vanishing; increase sc_thr if target
+ *   is physically unreachable.
+ *
+ * @param new_spin Mi at current lambda
+ * @param spin Mi at previous lambda
+ * @param delta_lambda Current lambda change
+ * @param dnu_last_step Previous cumulative step
+ * @param print Whether to print detailed gradient info
+ * @return true if gradient decayed below threshold (should terminate), false otherwise
+ */
 template <>
 bool spinconstrain::SpinConstrain<std::complex<double>>::check_gradient_decay(
     std::vector<ModuleBase::Vector3<double>> new_spin,
@@ -108,21 +254,30 @@ bool spinconstrain::SpinConstrain<std::complex<double>>::check_gradient_decay(
     const double zero = 0.0;
     int nat = this->get_nat();
     int ntype = this->get_ntype();
+
+    // Change in magnetic moments and lambda
     std::vector<ModuleBase::Vector3<double>> spin_change(nat, 0.0);
     std::vector<ModuleBase::Vector3<double>> nu_change(nat, 1.0);
+
+    // Full gradient matrix: dM[ia][ic]/dlambda[ja][jc]
     std::vector<std::vector<std::vector<std::vector<double>>>> spin_nu_gradient(
         nat,
         std::vector<std::vector<std::vector<double>>>(
             3,
             std::vector<std::vector<double>>(nat, std::vector<double>(3, 0.0))));
+    // Diagonal gradient: dM[ia][ic]/dlambda[ia][ic] (self-susceptibility)
     std::vector<ModuleBase::Vector3<double>> spin_nu_gradient_diag(nat, 0.0);
     std::vector<std::pair<int, int>> max_gradient_index(ntype, std::make_pair(0, 0));
     std::vector<double> max_gradient(ntype, 0.0);
+
     subtract_2d(new_spin, spin, spin_change);
     subtract_2d(delta_lambda, dnu_last_step, nu_change);
+
+    // Mask unconstrained components
     where_fill_scalar_2d(this->constrain_, 0, zero, spin_change);
     where_fill_scalar_2d(this->constrain_, 0, one, nu_change);
-    // calculate spin_nu_gradient
+
+    // Calculate full gradient matrix
     for (int ia = 0; ia < nat; ia++)
     {
         for (int ic = 0; ic < 3; ic++)
@@ -131,11 +286,20 @@ bool spinconstrain::SpinConstrain<std::complex<double>>::check_gradient_decay(
             {
                 for (int jc = 0; jc < 3; jc++)
                 {
+                    if (std::abs(nu_change[ja][jc]) < 1e-30) {
+                        printf("[GRAD-DECAY] WARNING: nu_change[%d][%d] too small! delta_lambda=(%.6e,%.6e,%.6e) dnu_last=(%.6e,%.6e,%.6e)\n",
+                               ja, jc, delta_lambda[ja].x, delta_lambda[ja].y, delta_lambda[ja].z,
+                               dnu_last_step[ja].x, dnu_last_step[ja].y, dnu_last_step[ja].z);
+                        fflush(stdout);
+                        nu_change[ja][jc] = 1e-30;
+                    }
                     spin_nu_gradient[ia][ic][ja][jc] = spin_change[ia][ic] / nu_change[ja][jc];
                 }
             }
         }
     }
+
+    // Extract diagonal gradient and find max per atom type
     for (const auto& sc_elem: this->get_atomCounts())
     {
         int it = sc_elem.first;
@@ -155,6 +319,7 @@ bool spinconstrain::SpinConstrain<std::complex<double>>::check_gradient_decay(
             }
         }
     }
+
     if (print)
     {
         print_2d("diagonal gradient: ", spin_nu_gradient_diag, this->nspin_);
@@ -170,6 +335,8 @@ bool spinconstrain::SpinConstrain<std::complex<double>>::check_gradient_decay(
             std::cout << max_gradient[it]/ModuleBase::Ry_to_eV << std::endl;
         }
     }
+
+    // Check if any atom type's gradient has decayed below threshold
     for (int it = 0; it < ntype; it++)
     {
         if (this->decay_grad_[it] > 0 && std::abs(max_gradient[it]) < this->decay_grad_[it])
@@ -180,4 +347,4 @@ bool spinconstrain::SpinConstrain<std::complex<double>>::check_gradient_decay(
         }
     }
     return false;
-}
\ No newline at end of file
+}
diff --git a/source/source_lcao/module_deltaspin/lambda_strategy_integration.cpp b/source/source_lcao/module_deltaspin/lambda_strategy_integration.cpp
new file mode 100644
index 00000000000..2ce2cba0f3a
--- /dev/null
+++ b/source/source_lcao/module_deltaspin/lambda_strategy_integration.cpp
@@ -0,0 +1,109 @@
+#include "spin_constrain.h"
+
+#include "lambda_update_strategies.h"
+
+/**
+ * @file lambda_strategy_integration.cpp
+ * @brief Integration of alternative lambda strategies into SpinConstrain.
+ *
+ * @par Status: INCOMPLETE
+ * This file references members (strategy_type_, strategy_) that are NOT
+ * declared in spin_constrain.h. The code will not compile as-is.
+ * To enable, add the following to spin_constrain.h private section:
+ *
+ *   enum class LambdaStrategyType { BFGS, LinearResponse, AugmentedLagrangian, HybridDelayed };
+ *   LambdaStrategyType strategy_type_ = LambdaStrategyType::BFGS;
+ *   std::unique_ptr<LambdaUpdateStrategy> strategy_;
+ *
+ * And add the file to CMakeLists.txt.
+ *
+ * @par Purpose
+ * Bridges the alternative strategy implementations (lambda_update_strategies.h)
+ * to the SpinConstrain class. Allows runtime selection of lambda update algorithm.
+ */
+
+namespace spinconstrain
+{
+
+/**
+ * @brief Set the lambda update strategy type.
+ *
+ * @details Creates the appropriate strategy object based on the enum value.
+ * For BFGS (default), sets strategy_ = nullptr (uses hard-coded lambda_loop.cpp).
+ *
+ * @param type Strategy type to use
+ */
+template <typename TK>
+void SpinConstrain<TK>::set_strategy_type(LambdaStrategyType type)
+{
+    strategy_type_ = type;
+    switch(type)
+    {
+        case LambdaStrategyType::BFGS:
+            strategy_ = nullptr; // Use hard-coded BFGS in lambda_loop.cpp
+            break;
+        case LambdaStrategyType::LinearResponse:
+            strategy_ = std::unique_ptr<LambdaUpdateStrategy>(
+                new LinearResponseUpdate());
+            break;
+        case LambdaStrategyType::AugmentedLagrangian:
+            strategy_ = std::unique_ptr<LambdaUpdateStrategy>(
+                new AugmentedLagrangianUpdate());
+            break;
+        case LambdaStrategyType::HybridDelayed:
+            strategy_ = std::unique_ptr<LambdaUpdateStrategy>(
+                new HybridDelayedUpdate());
+            break;
+        default:
+            strategy_ = nullptr;
+            strategy_type_ = LambdaStrategyType::BFGS;
+            break;
+    }
+}
+
+/**
+ * @brief Configure parameters for the active strategy.
+ *
+ * @param mu_init Initial penalty parameter (AugmentedLagrangian, HybridDelayed)
+ * @param mu_max Maximum penalty parameter
+ * @param mu_growth Penalty growth factor
+ * @param mix_beta Mixing parameter (LinearResponse)
+ * @param sc_scf_thr SCF charge convergence threshold (HybridDelayed)
+ */
+template <typename TK>
+void SpinConstrain<TK>::set_strategy_params(double mu_init, double mu_max,
+                                             double mu_growth, double mix_beta,
+                                             double sc_scf_thr)
+{
+    if (!strategy_) return; // BFGS uses hard-coded parameters
+
+    if (strategy_type_ == LambdaStrategyType::LinearResponse)
+    {
+        if (auto* lr = dynamic_cast<LinearResponseUpdate*>(strategy_.get()))
+        {
+            // mix_beta is the primary tunable parameter for LinearResponse
+            // chi_min, chi_max, lambda_max keep defaults
+            *lr = LinearResponseUpdate(0.01, 100.0, mix_beta, 10.0);
+        }
+    }
+    else if (strategy_type_ == LambdaStrategyType::AugmentedLagrangian)
+    {
+        if (auto* al = dynamic_cast<AugmentedLagrangianUpdate*>(strategy_.get()))
+        {
+            *al = AugmentedLagrangianUpdate(mu_init, mu_max, mu_growth, 5, 10.0);
+        }
+    }
+    else if (strategy_type_ == LambdaStrategyType::HybridDelayed)
+    {
+        if (auto* hd = dynamic_cast<HybridDelayedUpdate*>(strategy_.get()))
+        {
+            *hd = HybridDelayedUpdate(sc_scf_thr, mu_init, mu_max, mu_growth, 5, 10, 10.0);
+        }
+    }
+}
+
+// Explicit template instantiation
+template class SpinConstrain<std::complex<double>>;
+template class SpinConstrain<double>;
+
+} // namespace spinconstrain
diff --git a/source/source_lcao/module_deltaspin/lambda_update_strategies.cpp b/source/source_lcao/module_deltaspin/lambda_update_strategies.cpp
new file mode 100644
index 00000000000..db98626dd6f
--- /dev/null
+++ b/source/source_lcao/module_deltaspin/lambda_update_strategies.cpp
@@ -0,0 +1,406 @@
+#include "lambda_update_strategies.h"
+#include <sstream>
+#include <cstring>
+
+/**
+ * @file lambda_update_strategies.cpp
+ * @brief Implementation of alternative lambda update strategies.
+ *
+ * @par Note
+ * These strategies are NOT compiled into the library (not in CMakeLists.txt).
+ * They are provided for future development.
+ */
+
+namespace spinconstrain
+{
+
+// ===================================================================
+// Helper functions
+// ===================================================================
+
+/**
+ * @brief Compute RMS error of |Mi - M_target| over constrained components.
+ */
+double compute_rms_error(const std::vector<ModuleBase::Vector3<double>>& Mi,
+                         const std::vector<ModuleBase::Vector3<double>>& target_mag,
+                         const std::vector<ModuleBase::Vector3<int>>& constrain,
+                         int nat)
+{
+    double sum = 0.0;
+    int n_count = 0;
+    for (int ia = 0; ia < nat; ++ia)
+    {
+        for (int ic = 0; ic < 3; ++ic)
+        {
+            if (constrain[ia][ic] != 0)
+            {
+                double diff = Mi[ia][ic] - target_mag[ia][ic];
+                sum += diff * diff;
+                ++n_count;
+            }
+        }
+    }
+    if (n_count == 0) return 0.0;
+    return std::sqrt(sum / n_count);
+}
+
+/**
+ * @brief Count number of constrained components within convergence threshold.
+ */
+int count_converged(const std::vector<ModuleBase::Vector3<double>>& Mi,
+                    const std::vector<ModuleBase::Vector3<double>>& target_mag,
+                    const std::vector<ModuleBase::Vector3<int>>& constrain,
+                    double sc_thr,
+                    int nat)
+{
+    int count = 0;
+    for (int ia = 0; ia < nat; ++ia)
+    {
+        for (int ic = 0; ic < 3; ++ic)
+        {
+            if (constrain[ia][ic] != 0)
+            {
+                double diff = Mi[ia][ic] - target_mag[ia][ic];
+                if (std::abs(diff) < sc_thr)
+                {
+                    ++count;
+                }
+            }
+        }
+    }
+    return count;
+}
+
+/**
+ * @brief Clip lambda values to [-lambda_max, +lambda_max] for constrained components.
+ */
+void cap_lambda(std::vector<ModuleBase::Vector3<double>>& lambda,
+                const std::vector<ModuleBase::Vector3<int>>& constrain,
+                double lambda_max,
+                int nat)
+{
+    for (int ia = 0; ia < nat; ++ia)
+    {
+        for (int ic = 0; ic < 3; ++ic)
+        {
+            if (constrain[ia][ic] != 0)
+            {
+                if (lambda[ia][ic] > lambda_max) lambda[ia][ic] = lambda_max;
+                if (lambda[ia][ic] < -lambda_max) lambda[ia][ic] = -lambda_max;
+            }
+        }
+    }
+}
+
+// ===================================================================
+// Scheme B: Linear Response (One-Step) Update
+// ===================================================================
+
+LinearResponseUpdate::LinearResponseUpdate(double chi_min,
+                                           double chi_max,
+                                           double mix_beta,
+                                           double lambda_max)
+    : chi_min_(chi_min), chi_max_(chi_max), mix_beta_(mix_beta),
+      lambda_max_(lambda_max), converged_(false), last_rms_(1e30)
+{
+}
+
+LambdaUpdateResult LinearResponseUpdate::update_lambda(
+    std::vector<ModuleBase::Vector3<double>>& lambda,
+    const std::vector<ModuleBase::Vector3<double>>& Mi,
+    const std::vector<ModuleBase::Vector3<double>>& target_mag,
+    const std::vector<ModuleBase::Vector3<int>>& constrain,
+    double sc_thr,
+    int iter,
+    int nat)
+{
+    LambdaUpdateResult result;
+    result.n_atoms = nat;
+
+    // Initialize response matrix if needed
+    if (static_cast<int>(chi_.size()) != nat)
+    {
+        chi_.assign(nat, ModuleBase::Vector3<double>(1.0, 1.0, 1.0));
+    }
+
+    // Estimate chi = dM/dlambda from history (finite difference)
+    if (iter >= 2 && static_cast<int>(Mi_history_.size()) >= 2)
+    {
+        const std::vector<ModuleBase::Vector3<double>>& Mi_old = Mi_history_[Mi_history_.size() - 2];
+        const std::vector<ModuleBase::Vector3<double>>& lambda_old = lambda_history_[lambda_history_.size() - 2];
+        for (int ia = 0; ia < nat; ++ia)
+        {
+            for (int ic = 0; ic < 3; ++ic)
+            {
+                if (constrain[ia][ic] == 0) continue;
+                double dlambda = lambda[ia][ic] - lambda_old[ia][ic];
+                double dM = Mi[ia][ic] - Mi_old[ia][ic];
+                if (std::abs(dlambda) > 1e-8)
+                {
+                    double chi_new = dM / dlambda;
+                    // Clamp chi to valid range
+                    if (chi_new > chi_min_ && chi_new < chi_max_)
+                    {
+                        chi_[ia][ic] = chi_new;
+                    }
+                }
+            }
+        }
+    }
+
+    // Update lambda: lambda += mix_beta * (M_target - M) / chi
+    for (int ia = 0; ia < nat; ++ia)
+    {
+        for (int ic = 0; ic < 3; ++ic)
+        {
+            if (constrain[ia][ic] == 0) continue;
+            double residual = target_mag[ia][ic] - Mi[ia][ic];
+            double delta = residual / chi_[ia][ic];
+            lambda[ia][ic] += mix_beta_ * delta;
+        }
+    }
+
+    // Cap lambda to prevent divergence
+    cap_lambda(lambda, constrain, lambda_max_, nat);
+
+    // Save history (keep last 5 entries)
+    Mi_history_.push_back(Mi);
+    lambda_history_.push_back(lambda);
+    if (static_cast<int>(Mi_history_.size()) > 5)
+    {
+        Mi_history_.erase(Mi_history_.begin());
+        lambda_history_.erase(Mi_history_.begin());
+    }
+
+    // Compute result
+    result.rms_error = compute_rms_error(Mi, target_mag, constrain, nat);
+    result.n_converged = count_converged(Mi, target_mag, constrain, sc_thr, nat);
+
+    double max_l = 0.0;
+    for (int ia = 0; ia < nat; ++ia)
+    {
+        for (int ic = 0; ic < 3; ++ic)
+        {
+            if (constrain[ia][ic] != 0)
+            {
+                max_l = std::max(max_l, std::abs(lambda[ia][ic]));
+            }
+        }
+    }
+    result.max_lambda = max_l;
+
+    converged_ = (result.rms_error < sc_thr);
+    result.status = converged_ ? "converged" : "updating";
+
+    return result;
+}
+
+// ===================================================================
+// Scheme C: Augmented Lagrangian Update
+// ===================================================================
+
+AugmentedLagrangianUpdate::AugmentedLagrangianUpdate(double mu_init,
+                                                      double mu_max,
+                                                      double mu_growth,
+                                                      int mu_update_interval,
+                                                      double lambda_max)
+    : mu_(mu_init), mu_init_(mu_init), mu_max_(mu_max),
+      mu_growth_(mu_growth), mu_update_interval_(mu_update_interval),
+      lambda_max_(lambda_max), converged_(false), last_iter_(0)
+{
+}
+
+LambdaUpdateResult AugmentedLagrangianUpdate::update_lambda(
+    std::vector<ModuleBase::Vector3<double>>& lambda,
+    const std::vector<ModuleBase::Vector3<double>>& Mi,
+    const std::vector<ModuleBase::Vector3<double>>& target_mag,
+    const std::vector<ModuleBase::Vector3<int>>& constrain,
+    double sc_thr,
+    int iter,
+    int nat)
+{
+    LambdaUpdateResult result;
+    result.n_atoms = nat;
+    last_iter_ = iter;
+
+    // Dual variable update: lambda += mu * (M - M_target)
+    for (int ia = 0; ia < nat; ++ia)
+    {
+        for (int ic = 0; ic < 3; ++ic)
+        {
+            if (constrain[ia][ic] == 0) continue;
+            double violation = Mi[ia][ic] - target_mag[ia][ic];
+            lambda[ia][ic] += mu_ * violation;
+        }
+    }
+
+    // Cap lambda
+    cap_lambda(lambda, constrain, lambda_max_, nat);
+
+    // Grow mu periodically to enforce constraint more strongly
+    if (iter > 0 && iter % mu_update_interval_ == 0)
+    {
+        mu_ = std::min(mu_max_, mu_ * mu_growth_);
+    }
+
+    // Compute result
+    result.rms_error = compute_rms_error(Mi, target_mag, constrain, nat);
+    result.n_converged = count_converged(Mi, target_mag, constrain, sc_thr, nat);
+
+    double max_l = 0.0;
+    for (int ia = 0; ia < nat; ++ia)
+    {
+        for (int ic = 0; ic < 3; ++ic)
+        {
+            if (constrain[ia][ic] != 0)
+            {
+                max_l = std::max(max_l, std::abs(lambda[ia][ic]));
+            }
+        }
+    }
+    result.max_lambda = max_l;
+
+    converged_ = (result.rms_error < sc_thr);
+    result.status = converged_ ? "converged" : "updating";
+
+    return result;
+}
+
+// ===================================================================
+// Scheme D: Hybrid Delayed Update
+// ===================================================================
+
+HybridDelayedUpdate::HybridDelayedUpdate(double sc_scf_thr,
+                                          double mu_init,
+                                          double mu_max,
+                                          double mu_growth,
+                                          int mu_update_interval,
+                                          int max_inner_steps,
+                                          double lambda_max)
+    : sc_scf_thr_(sc_scf_thr), drho_(1e30), mu_(mu_init), mu_init_(mu_init),
+      mu_max_(mu_max), mu_growth_(mu_growth),
+      mu_update_interval_(mu_update_interval),
+      max_inner_steps_(max_inner_steps), lambda_max_(lambda_max),
+      converged_(false), inner_steps_(0), phase_("early")
+{
+}
+
+LambdaUpdateResult HybridDelayedUpdate::update_lambda(
+    std::vector<ModuleBase::Vector3<double>>& lambda,
+    const std::vector<ModuleBase::Vector3<double>>& Mi,
+    const std::vector<ModuleBase::Vector3<double>>& target_mag,
+    const std::vector<ModuleBase::Vector3<int>>& constrain,
+    double sc_thr,
+    int iter,
+    int nat)
+{
+    LambdaUpdateResult result;
+    result.n_atoms = nat;
+
+    // =============================================================
+    // PHASE DECISION based on charge density convergence (drho_)
+    // =============================================================
+    if (drho_ > sc_scf_thr_ * 100)
+    {
+        // Early phase: charge density changing rapidly, skip lambda update
+        phase_ = "early";
+        result.rms_error = compute_rms_error(Mi, target_mag, constrain, nat);
+        result.n_converged = 0;
+        result.max_lambda = 0.0;
+        for (int ia = 0; ia < nat; ++ia)
+        {
+            for (int ic = 0; ic < 3; ++ic)
+            {
+                if (constrain[ia][ic] != 0)
+                {
+                    result.max_lambda = std::max(result.max_lambda, std::abs(lambda[ia][ic]));
+                }
+            }
+        }
+        converged_ = (result.rms_error < sc_thr);
+        result.status = "skipped_early";
+        return result;
+    }
+    else if (drho_ > sc_scf_thr_)
+    {
+        // Mid phase: charge density stabilizing, lightweight augmented Lagrangian
+        phase_ = "mid";
+        for (int ia = 0; ia < nat; ++ia)
+        {
+            for (int ic = 0; ic < 3; ++ic)
+            {
+                if (constrain[ia][ic] == 0) continue;
+                double violation = Mi[ia][ic] - target_mag[ia][ic];
+                lambda[ia][ic] += mu_ * violation;
+            }
+        }
+        cap_lambda(lambda, constrain, lambda_max_, nat);
+
+        if (iter > 0 && iter % mu_update_interval_ == 0)
+        {
+            mu_ = std::min(mu_max_, mu_ * mu_growth_);
+        }
+    }
+    else
+    {
+        // Late phase: charge density converged, full augmented Lagrangian
+        phase_ = "late";
+        for (int ia = 0; ia < nat; ++ia)
+        {
+            for (int ic = 0; ic < 3; ++ic)
+            {
+                if (constrain[ia][ic] == 0) continue;
+                double violation = Mi[ia][ic] - target_mag[ia][ic];
+                lambda[ia][ic] += mu_ * violation;
+            }
+        }
+        cap_lambda(lambda, constrain, lambda_max_, nat);
+
+        if (iter > 0 && iter % mu_update_interval_ == 0)
+        {
+            mu_ = std::min(mu_max_, mu_ * mu_growth_);
+        }
+
+        // Check if fallback to inner loop is needed (RMS still too large)
+        double rms = compute_rms_error(Mi, target_mag, constrain, nat);
+        if (rms > sc_thr * 10 && inner_steps_ < max_inner_steps_)
+        {
+            result.status = "fallback_triggered";
+            inner_steps_++;
+        }
+    }
+
+    // Compute result
+    result.rms_error = compute_rms_error(Mi, target_mag, constrain, nat);
+    result.n_converged = count_converged(Mi, target_mag, constrain, sc_thr, nat);
+
+    double max_l = 0.0;
+    for (int ia = 0; ia < nat; ++ia)
+    {
+        for (int ic = 0; ic < 3; ++ic)
+        {
+            if (constrain[ia][ic] != 0)
+            {
+                max_l = std::max(max_l, std::abs(lambda[ia][ic]));
+            }
+        }
+    }
+    result.max_lambda = max_l;
+
+    converged_ = (result.rms_error < sc_thr);
+    if (result.status != "fallback_triggered")
+    {
+        if (converged_)
+        {
+            result.status = "converged";
+        }
+        else
+        {
+            result.status = std::string("updating_") + phase_;
+        }
+    }
+
+    return result;
+}
+
+} // namespace spinconstrain
diff --git a/source/source_lcao/module_deltaspin/lambda_update_strategies.h b/source/source_lcao/module_deltaspin/lambda_update_strategies.h
new file mode 100644
index 00000000000..a43e6805367
--- /dev/null
+++ b/source/source_lcao/module_deltaspin/lambda_update_strategies.h
@@ -0,0 +1,285 @@
+#ifndef LAMBDA_UPDATE_STRATEGIES_H
+#define LAMBDA_UPDATE_STRATEGIES_H
+
+#include <vector>
+#include <string>
+#include <cmath>
+#include <algorithm>
+#include <limits>
+
+#include "source_base/vector3.h"
+
+/**
+ * @file lambda_update_strategies.h
+ * @brief Alternative lambda update strategies (NOT currently used in production).
+ *
+ * @par Status
+ * These strategies are experimental and NOT compiled into the library
+ * (not listed in CMakeLists.txt). The production code uses the hard-coded
+ * BFGS-like optimizer in lambda_loop.cpp. These strategies are provided
+ * for future development and experimentation.
+ *
+ * @par Available strategies
+ * - BFGS (default): Hard-coded in lambda_loop.cpp, uses conjugate gradient
+ * - LinearResponse (Scheme B): Estimates susceptibility chi = dM/dlambda
+ * - AugmentedLagrangian (Scheme C): Dual ascent with penalty method
+ * - HybridDelayed (Scheme D): Three-phase approach based on charge convergence
+ *
+ * @par Integration
+ * To use these strategies, the following members need to be added to
+ * SpinConstrain (in spin_constrain.h):
+ *   LambdaStrategyType strategy_type_;
+ *   std::unique_ptr<LambdaUpdateStrategy> strategy_;
+ * And set_strategy_type()/set_strategy_params() need to be called from
+ * init_sc() or the ESolver layer.
+ */
+
+namespace spinconstrain
+
+/**
+ * @brief Result struct for lambda update operations.
+ */
+struct LambdaUpdateResult
+{
+    int n_atoms;
+    double rms_error;            ///< RMS of |M - M_target| after update
+    double max_lambda;           ///< max |lambda| across all atoms/components
+    int n_converged;             ///< number of (atom, component) pairs converged
+    std::string status;          ///< "converged", "updating", "fallback_triggered"
+};
+
+/**
+ * @brief Pure abstract base class for lambda update strategies.
+ *
+ * @par Design pattern
+ * Strategy pattern: different update algorithms can be swapped at runtime
+ * by creating a concrete subclass and passing it to SpinConstrain.
+ */
+class LambdaUpdateStrategy
+{
+  public:
+    virtual ~LambdaUpdateStrategy() = default;
+
+    /**
+     * @brief Update lambda values based on current magnetic moments.
+     *
+     * @param lambda Current Lagrange multipliers (modified in place)
+     * @param Mi Current magnetic moments
+     * @param target_mag Target magnetic moments
+     * @param constrain Constraint flags (0=free, 1=constrained)
+     * @param sc_thr Convergence threshold
+     * @param iter Current iteration number
+     * @param nat Number of atoms
+     * @return Result struct with convergence info
+     */
+    virtual LambdaUpdateResult update_lambda(std::vector<ModuleBase::Vector3<double>>& lambda,
+                                             const std::vector<ModuleBase::Vector3<double>>& Mi,
+                                             const std::vector<ModuleBase::Vector3<double>>& target_mag,
+                                             const std::vector<ModuleBase::Vector3<int>>& constrain,
+                                             double sc_thr,
+                                             int iter,
+                                             int nat) = 0;
+
+    virtual std::string name() const = 0;
+    virtual bool is_converged() const = 0;
+};
+
+/**
+ * @brief Compute RMS error of |M - M_target| (respecting constrain flags).
+ */
+double compute_rms_error(const std::vector<ModuleBase::Vector3<double>>& Mi,
+                         const std::vector<ModuleBase::Vector3<double>>& target_mag,
+                         const std::vector<ModuleBase::Vector3<int>>& constrain,
+                         int nat);
+
+/**
+ * @brief Count converged components (|Mi - M_target| < sc_thr).
+ */
+int count_converged(const std::vector<ModuleBase::Vector3<double>>& Mi,
+                    const std::vector<ModuleBase::Vector3<double>>& target_mag,
+                    const std::vector<ModuleBase::Vector3<int>>& constrain,
+                    double sc_thr,
+                    int nat);
+
+/**
+ * @brief Apply absolute cap to lambda values to prevent divergence.
+ */
+void cap_lambda(std::vector<ModuleBase::Vector3<double>>& lambda,
+                const std::vector<ModuleBase::Vector3<int>>& constrain,
+                double lambda_max,
+                int nat);
+
+// ===================================================================
+// Scheme B: Linear Response (One-Step) Update
+// ===================================================================
+
+/**
+ * @brief Linear response lambda update: lambda += mix_beta * (M_target - M) / chi.
+ *
+ * @par Algorithm
+ * Estimates the magnetic susceptibility chi = dM/dlambda from the history
+ * of the last 2 iterations:
+ *   chi = (Mi_current - Mi_previous) / (lambda_current - lambda_previous)
+ * Then updates:
+ *   lambda += mix_beta * (M_target - Mi) / chi
+ *
+ * @par Parameters
+ * - chi_min: minimum susceptibility (prevents division by small numbers)
+ * - chi_max: maximum susceptibility (prevents unstable updates)
+ * - mix_beta: mixing parameter (0.3 = conservative, 1.0 = aggressive)
+ * - lambda_max: absolute cap on lambda values
+ */
+class LinearResponseUpdate : public LambdaUpdateStrategy
+{
+  public:
+    LinearResponseUpdate(double chi_min = 0.01,
+                         double chi_max = 100.0,
+                         double mix_beta = 0.3,
+                         double lambda_max = 10.0);
+
+    LambdaUpdateResult update_lambda(std::vector<ModuleBase::Vector3<double>>& lambda,
+                                     const std::vector<ModuleBase::Vector3<double>>& Mi,
+                                     const std::vector<ModuleBase::Vector3<double>>& target_mag,
+                                     const std::vector<ModuleBase::Vector3<int>>& constrain,
+                                     double sc_thr,
+                                     int iter,
+                                     int nat) override;
+
+    std::string name() const override { return "LinearResponse"; }
+    bool is_converged() const override { return converged_; }
+
+    const std::vector<ModuleBase::Vector3<double>>& get_chi() const { return chi_; }
+
+  private:
+    double chi_min_;
+    double chi_max_;
+    double mix_beta_;
+    double lambda_max_;
+    bool converged_;
+    double last_rms_;
+    std::vector<ModuleBase::Vector3<double>> chi_; ///< Estimated susceptibility dM/dlambda
+    std::vector<std::vector<ModuleBase::Vector3<double>>> Mi_history_; ///< Last 5 Mi values
+    std::vector<std::vector<ModuleBase::Vector3<double>>> lambda_history_; ///< Last 5 lambda values
+};
+
+// ===================================================================
+// Scheme C: Augmented Lagrangian Update
+// ===================================================================
+
+/**
+ * @brief Augmented Lagrangian lambda update: lambda += mu * (M - M_target).
+ *
+ * @par Algorithm
+ * Simple dual ascent method. The penalty parameter mu grows periodically
+ * to enforce the constraint more strongly:
+ *   lambda += mu * (Mi - M_target)
+ *   mu *= mu_growth every mu_update_interval iterations
+ *
+ * @par Parameters
+ * - mu_init: initial penalty parameter
+ * - mu_max: maximum penalty (prevents numerical instability)
+ * - mu_growth: growth factor (1.5 = moderate)
+ * - mu_update_interval: iterations between mu updates
+ * - lambda_max: absolute cap on lambda values
+ */
+class AugmentedLagrangianUpdate : public LambdaUpdateStrategy
+{
+  public:
+    AugmentedLagrangianUpdate(double mu_init = 0.1,
+                              double mu_max = 10.0,
+                              double mu_growth = 1.5,
+                              int mu_update_interval = 5,
+                              double lambda_max = 10.0);
+
+    LambdaUpdateResult update_lambda(std::vector<ModuleBase::Vector3<double>>& lambda,
+                                     const std::vector<ModuleBase::Vector3<double>>& Mi,
+                                     const std::vector<ModuleBase::Vector3<double>>& target_mag,
+                                     const std::vector<ModuleBase::Vector3<int>>& constrain,
+                                     double sc_thr,
+                                     int iter,
+                                     int nat) override;
+
+    std::string name() const override { return "AugmentedLagrangian"; }
+    bool is_converged() const override { return converged_; }
+
+    double get_mu() const { return mu_; }
+    void reset_mu() { mu_ = mu_init_; }
+
+  private:
+    double mu_; ///< Current penalty parameter
+    double mu_init_;
+    double mu_max_;
+    double mu_growth_;
+    int mu_update_interval_;
+    double lambda_max_;
+    bool converged_;
+    int last_iter_;
+};
+
+// ===================================================================
+// Scheme D: Hybrid Delayed Update
+// ===================================================================
+
+/**
+ * @brief Hybrid delayed update: three-phase approach based on charge convergence.
+ *
+ * @par Algorithm
+ * Phase decision based on drho (charge density change):
+ * - Early phase (drho > sc_scf_thr * 100): Skip lambda update entirely.
+ *   The charge density is changing too rapidly for lambda optimization.
+ * - Mid phase (sc_scf_thr < drho < sc_scf_thr * 100): Lightweight augmented
+ *   Lagrangian update with small mu.
+ * - Late phase (drho < sc_scf_thr): Full augmented Lagrangian with fallback
+ *   to inner loop if RMS error is still large.
+ *
+ * @par Parameters
+ * - sc_scf_thr: SCF charge convergence threshold (phase decision boundary)
+ * - mu_init, mu_max, mu_growth: Augmented Lagrangian parameters
+ * - max_inner_steps: maximum fallback inner loop iterations
+ * - lambda_max: absolute cap on lambda values
+ */
+class HybridDelayedUpdate : public LambdaUpdateStrategy
+{
+  public:
+    HybridDelayedUpdate(double sc_scf_thr = 1e-3,
+                        double mu_init = 0.1,
+                        double mu_max = 10.0,
+                        double mu_growth = 1.5,
+                        int mu_update_interval = 5,
+                        int max_inner_steps = 10,
+                        double lambda_max = 10.0);
+
+    void set_drho(double drho) { drho_ = drho; }
+
+    LambdaUpdateResult update_lambda(std::vector<ModuleBase::Vector3<double>>& lambda,
+                                     const std::vector<ModuleBase::Vector3<double>>& Mi,
+                                     const std::vector<ModuleBase::Vector3<double>>& target_mag,
+                                     const std::vector<ModuleBase::Vector3<int>>& constrain,
+                                     double sc_thr,
+                                     int iter,
+                                     int nat) override;
+
+    std::string name() const override { return "HybridDelayed"; }
+    bool is_converged() const override { return converged_; }
+
+    std::string get_phase() const { return phase_; }
+    void reset() { mu_ = mu_init_; inner_steps_ = 0; phase_ = "early"; }
+
+  private:
+    double sc_scf_thr_;
+    double drho_; ///< Current charge density change
+    double mu_;
+    double mu_init_;
+    double mu_max_;
+    double mu_growth_;
+    int mu_update_interval_;
+    int max_inner_steps_;
+    double lambda_max_;
+    bool converged_;
+    int inner_steps_; ///< Count of fallback inner loop iterations
+    std::string phase_; ///< Current phase: "early", "mid", "late"
+};
+
+} // namespace spinconstrain
+
+#endif // LAMBDA_UPDATE_STRATEGIES_H
diff --git a/source/source_lcao/module_deltaspin/sc_parse_json.cpp b/source/source_lcao/module_deltaspin/sc_parse_json.cpp
new file mode 100644
index 00000000000..37f23fa3973
--- /dev/null
+++ b/source/source_lcao/module_deltaspin/sc_parse_json.cpp
@@ -0,0 +1,4 @@
+#include "spin_constrain.h"
+
+template class spinconstrain::SpinConstrain<std::complex<double>>;
+template class spinconstrain::SpinConstrain<double>;
diff --git a/source/source_lcao/module_deltaspin/spin_constrain.cpp b/source/source_lcao/module_deltaspin/spin_constrain.cpp
index 6b898f34f6e..a8c3c262445 100644
--- a/source/source_lcao/module_deltaspin/spin_constrain.cpp
+++ b/source/source_lcao/module_deltaspin/spin_constrain.cpp
@@ -8,6 +8,13 @@
 namespace spinconstrain
 {
 
+/**
+ * @brief Singleton instance accessor.
+ *
+ * @details Uses Meyers' Singleton pattern (local static variable).
+ * Guaranteed thread-safe initialization in C++11 and later.
+ * Each template instantiation (complex<double>, double) gets its own instance.
+ */
 template <typename TK>
 SpinConstrain<TK>& SpinConstrain<TK>::getScInstance()
 {
@@ -15,6 +22,25 @@ SpinConstrain<TK>& SpinConstrain<TK>::getScInstance()
     return instance;
 }
 
+/**
+ * @brief Calculate the spin constraint energy: E_scon = -sum_i (lambda_i . Mi_i).
+ *
+ * @details The constraint energy is the Lagrange multiplier term in the
+ * constrained DFT functional:
+ *   E'[rho] = E_DFT[rho] - sum_i lambda_i . (Mi_i - M_target_i)
+ *
+ * IMPORTANT: Returns 0.0 if magnetic moments are NOT yet converged.
+ * This is because the constraint energy is only physically meaningful
+ * when Mi ≈ M_target. Before convergence, the lambda values are still
+ * adjusting and the energy would be misleading.
+ *
+ * @par Output meaning
+ * - E_scon < 0: lambda and Mi are aligned (system resists the constraint)
+ * - E_scon > 0: lambda and Mi are anti-aligned (constraint assists the system)
+ * - E_scon = 0: not converged OR all lambda = 0 (no constraint needed)
+ *
+ * @return Constraint energy in Ry (0.0 if not converged)
+ */
 template <typename TK>
 double SpinConstrain<TK>::cal_escon()
 {
@@ -72,6 +98,164 @@ int SpinConstrain<TK>::get_nspin() const
     return this->nspin_;
 }
 
+template <typename TK>
+void SpinConstrain<TK>::set_npol(int npol)
+{
+    this->npol_ = npol;
+}
+
+template <typename TK>
+int SpinConstrain<TK>::get_npol() const
+{
+    return this->npol_;
+}
+
+/**
+ * @brief Get spin sign for k-point: determines whether this k-point is
+ * spin-up (+1) or spin-down (-1) in collinear (nspin=2) calculations.
+ *
+ * @details In collinear spin, the wavefunction is split into two k-point pools:
+ * - isk[ik] == 0: spin-up channel (majority spin) -> sign = +1
+ * - isk[ik] == 1: spin-down channel (minority spin) -> sign = -1
+ * For non-collinear (npol=2), always returns +1 since both components
+ * are handled together.
+ *
+ * @return +1 for spin-up, -1 for spin-down, +1 for non-collinear
+ */
+template <typename TK>
+int SpinConstrain<TK>::get_spin_sign(int ik) const
+{
+    if (this->npol_ == 2) return 1;
+    // npol == 1 (nspin == 2): isk[ik]==0 => spin-up (+1), isk[ik]==1 => spin-down (-1)
+    return (this->pelec->klist->isk[ik] == 0) ? 1 : -1;
+}
+
+/**
+ * @brief Accumulate magnetic moments from projector coefficients (becp) for one k-point.
+ *
+ * @par Algorithm (npol=2, non-collinear):
+ * For each atom, compute the 2x2 occupation matrix from becp coefficients:
+ *   occ[0] = sum_ih becp_up^*(ih) * becp_up(ih)   = <psi_up|P_at|psi_up>
+ *   occ[1] = sum_ih becp_up^*(ih) * becp_dn(ih)   = <psi_up|P_at|psi_dn>
+ *   occ[2] = sum_ih becp_dn^*(ih) * becp_up(ih)   = <psi_dn|P_at|psi_up>
+ *   occ[3] = sum_ih becp_dn^*(ih) * becp_dn(ih)   = <psi_dn|P_at|psi_dn>
+ * where P_at = sum_{l,m} |alpha_{l,m}><alpha_{l,m}| is the atomic projector.
+ *
+ * The magnetic moment is extracted via Pauli matrix traces:
+ *   Mx = Re(occ[1] + occ[2]), My = Im(occ[1] - occ[2]), Mz = Re(occ[0] - occ[3])
+ *
+ * @par Algorithm (npol=1, collinear):
+ * Only the z-component (spin projection) is computed:
+ *   occ = sum_ih |becp(ih)|^2 = <psi|P_at|psi>
+ *   Mz += weight * occ * spin_sign
+ * where spin_sign = +1 for spin-up, -1 for spin-down.
+ *
+ * @param becp Projector coefficients, layout: [ib * npol * nkb + spin * nkb + ih]
+ * @param nkb Total number of projectors across all atoms
+ * @param nbands Number of bands (occupied + unoccupied in the subspace)
+ * @param npol Number of spinor components (1 for collinear, 2 for non-collinear)
+ * @param ik K-point index (used for spin_sign lookup in collinear mode)
+ * @param wg_ik Band occupation weights for this k-point (from Fermi-Dirac)
+ * @param nh_iat Array of projector counts per atom: nh_iat[iat] = nproj for atom iat
+ */
+template <typename TK>
+void SpinConstrain<TK>::accumulate_Mi_from_becp(const std::complex<double>* becp,
+                                                  int nkb,
+                                                  int nbands,
+                                                  int npol,
+                                                  int ik,
+                                                  const double* wg_ik,
+                                                  const int* nh_iat)
+{
+    if (npol == 2)
+    {
+        for (int ib = 0; ib < nbands; ib++)
+        {
+            const double weight = wg_ik[ib];
+            int begin_ih = 0;
+            for (int iat = 0; iat < static_cast<int>(this->Mi_.size()); iat++)
+            {
+                std::complex<double> occ[4] = {ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO};
+                const int nh = nh_iat[iat];
+                for (int ih = 0; ih < nh; ih++)
+                {
+                    const int index = ib * 2 * nkb + begin_ih + ih;
+                    occ[0] += conj(becp[index]) * becp[index];
+                    occ[1] += conj(becp[index]) * becp[index + nkb];
+                    occ[2] += conj(becp[index + nkb]) * becp[index];
+                    occ[3] += conj(becp[index + nkb]) * becp[index + nkb];
+                }
+                this->Mi_[iat] += pauli_to_moment(occ, weight);
+                begin_ih += nh;
+            }
+        }
+    }
+    else // npol == 1
+    {
+        const int sign = this->get_spin_sign(ik);
+        for (int ib = 0; ib < nbands; ib++)
+        {
+            const double weight = wg_ik[ib];
+            int begin_ih = 0;
+            for (int iat = 0; iat < static_cast<int>(this->Mi_.size()); iat++)
+            {
+                double occ = 0.0;
+                const int nh = nh_iat[iat];
+                for (int ih = 0; ih < nh; ih++)
+                {
+                    const int index = ib * nkb + begin_ih + ih;
+                    occ += (conj(becp[index]) * becp[index]).real();
+                }
+                this->Mi_[iat].z += weight * occ * sign;
+                begin_ih += nh;
+            }
+        }
+    }
+}
+
+template <typename TK>
+int SpinConstrain<TK>::get_nw() const
+{
+    int nw = 0;
+    for (const auto& pair : this->orbitalCounts)
+    {
+        nw += pair.second;
+    }
+    return nw;
+}
+
+/**
+ * @brief Convert (itype, local_atom_index, orbital_index) to global orbital index.
+ *
+ * @details The global orbital index is used to access elements in distributed
+ * matrices (ScaLAPACK format). The mapping is:
+ *   iwt = sum_{t < itype} orbitalCounts[t]  +  iat * orbitalCounts[itype]  +  orbital_index
+ * where iat = get_iat(itype, local_atom_index).
+ *
+ * @return Global orbital index, or 0 if itype not found
+ */
+template <typename TK>
+int SpinConstrain<TK>::get_iwt(int itype, int iat, int orbital_index) const
+{
+    auto it1 = this->orbitalCounts.find(itype);
+    if (it1 == this->orbitalCounts.end())
+    {
+        return 0;
+    }
+    int offset = 0;
+    for (auto it = this->orbitalCounts.begin(); it != it1; ++it)
+    {
+        offset += it->second;
+    }
+    auto it2 = this->atomCounts.find(itype);
+    if (it2 == this->atomCounts.end())
+    {
+        return offset;
+    }
+    return offset + iat * it1->second + orbital_index;
+}
+
+/// @brief Get total number of atoms across all element types
 template <typename TK>
 int SpinConstrain<TK>::get_nat()
 {
@@ -83,12 +267,25 @@ int SpinConstrain<TK>::get_nat()
     return nat;
 }
 
+/// @brief Get number of element types
 template <typename TK>
 int SpinConstrain<TK>::get_ntype()
 {
     return this->atomCounts.size();
 }
 
+/**
+ * @brief Validate atom count data integrity.
+ *
+ * @details Checks that atomCounts has been properly initialized and contains
+ * valid data. Called before any operation that depends on atom indexing.
+ *
+ * @par Error conditions
+ * - "atomCounts is not set": init_sc() was not called
+ * - "nat <= 0": no atoms in the system
+ * - "itype out of range": element type index exceeds ntype
+ * - "number of atoms <= 0": some element type has no atoms
+ */
 template <typename TK>
 void SpinConstrain<TK>::check_atomCounts()
 {
@@ -115,7 +312,24 @@ void SpinConstrain<TK>::check_atomCounts()
     }
 }
 
-// get iat
+/**
+ * @brief Convert (element_type, local_atom_index) to global atom index.
+ *
+ * @details Atoms in ABACUS are organized by element type. Within each type,
+ * atoms are indexed locally (0, 1, ..., nat_itype-1). This function maps
+ * to the global index that runs across all atoms (0, 1, ..., nat-1).
+ *
+ * Example: If type 0 has 2 Fe atoms and type 1 has 3 O atoms:
+ *   get_iat(0, 0) -> 0 (Fe_0)
+ *   get_iat(0, 1) -> 1 (Fe_1)
+ *   get_iat(1, 0) -> 2 (O_0)
+ *   get_iat(1, 1) -> 3 (O_1)
+ *   get_iat(1, 2) -> 4 (O_2)
+ *
+ * @param itype Element type index (0 to ntype-1)
+ * @param atom_index Local index within the element type
+ * @return Global atom index
+ */
 template <typename TK>
 int SpinConstrain<TK>::get_iat(int itype, int atom_index)
 {
@@ -170,7 +384,8 @@ const std::map<int, std::map<int, int>>& SpinConstrain<TK>::get_lnchiCounts() co
     return this->lnchiCounts;
 }
 
-// set sc_lambda from ScData
+// set sc_lambda from ScData (parsed from STRU file)
+// ScData is organized by element type; this function flattens it to per-atom arrays
 template <typename TK>
 void SpinConstrain<TK>::set_sc_lambda()
 {
@@ -193,7 +408,20 @@ void SpinConstrain<TK>::set_sc_lambda()
     }
 }
 
-// set target_mag from ScData
+/**
+ * @brief Set target magnetic moments from ScData (parsed from STRU file).
+ *
+ * @details Supports two specification modes:
+ * - mag_type=0: Direct Cartesian (mx, my, mz) in uB
+ * - mag_type=1: Spherical (|M|, theta, phi) converted to Cartesian:
+ *   Mx = |M| * sin(theta) * cos(phi)
+ *   My = |M| * sin(theta) * sin(phi)
+ *   Mz = |M| * cos(theta)
+ *   Angles are in degrees and converted to radians.
+ *
+ * Near-zero components (< 1e-14) are explicitly set to 0.0 to avoid
+ * floating-point noise in constraint checks.
+ */
 template <typename TK>
 void SpinConstrain<TK>::set_target_mag()
 {
@@ -233,7 +461,19 @@ void SpinConstrain<TK>::set_target_mag()
     }
 }
 
-// set constrain from ScData
+/**
+ * @brief Set constraint flags from ScData.
+ *
+ * @details The constrain array determines which components of each atom's
+ * magnetic moment are actively constrained:
+ * - constrain[ia].x = 1: Mx is constrained to target_mag[ia].x
+ * - constrain[ia].y = 1: My is constrained to target_mag[ia].y
+ * - constrain[ia].z = 1: Mz is constrained to target_mag[ia].z
+ * - constrain[ia].c = 0: component is free (determined by the system)
+ *
+ * Default is all zeros (no constraints). Components with constrain=0
+ * are excluded from the lambda optimization and RMS error calculation.
+ */
 template <typename TK>
 void SpinConstrain<TK>::set_constrain()
 {
@@ -310,7 +550,7 @@ void SpinConstrain<TK>::set_target_mag(const std::vector<ModuleBase::Vector3<dou
         for (int iat = 0; iat < nat; iat++)
         {
             this->target_mag_[iat].z
-                = target_mag_in[iat].x; /// this is wired because the UnitCell class set in x direction
+                = target_mag_in[iat].z;
         }
     }
     else if (this->nspin_ == 4)
@@ -359,7 +599,7 @@ const std::vector<ModuleBase::Vector3<int>>& SpinConstrain<TK>::get_constrain()
     return this->constrain_;
 }
 
-/// zero atomic magnetic moment
+/// @brief Reset all atomic magnetic moments to zero. Called before each Mi calculation.
 template <typename TK>
 void SpinConstrain<TK>::zero_Mi()
 {
@@ -510,7 +750,20 @@ void SpinConstrain<TK>::set_ParaV(Parallel_Orbitals* ParaV_in)
     }
 }
 
-/// print Mi
+/**
+ * @brief Print magnetic moments per atom in formatted table.
+ *
+ * @par Output format
+ * - nspin=2: "ATOM   1    2.0000000000" (z-component only)
+ * - nspin=4: "ATOM   1    0.0010000000    0.0020000000    1.9990000000" (x, y, z)
+ *
+ * @par Interpretation
+ * - Positive Mi.z: spin aligned with z-axis (spin-up character)
+ * - Negative Mi.z: spin anti-aligned with z-axis (spin-down character)
+ * - Non-zero Mi.x/Mi.y: non-collinear spin components
+ * - Mi close to target_mag: constraint is well-satisfied
+ * - Mi far from target_mag: constraint is not yet converged
+ */
 template <typename TK>
 void SpinConstrain<TK>::print_Mi(std::ofstream& ofs_running)
 {
@@ -555,7 +808,25 @@ void SpinConstrain<TK>::print_Mi(std::ofstream& ofs_running)
     }
 }
 
-/// print magnetic force (defined as \frac{\delta{L}}/{\delta{Mi}} = -lambda[iat])
+/**
+ * @brief Print the magnetic force (-lambda) per atom in eV/uB.
+ *
+ * @par Physical meaning
+ * The "magnetic force" is the derivative of the constrained Lagrangian
+ * with respect to the magnetic moment: dL/dMi = -lambda_i.
+ * It represents how much energy would change if the constraint were relaxed.
+ *
+ * @par Interpretation
+ * - Large |lambda|: The system strongly resists the target moment constraint
+ * - lambda ≈ 0: The system naturally has the target moment (no constraint needed)
+ * - Positive lambda.z: The constraint pushes the moment in the +z direction
+ * - Negative lambda.z: The constraint pushes the moment in the -z direction
+ *
+ * @par Typical values
+ * - Well-converged SCF: lambda ~ 0.01-1 eV/uB
+ * - Strongly constrained: lambda ~ 1-10 eV/uB
+ * - Diverging SCF: lambda growing without bound (check target_mag合理性)
+ */
 template <typename TK>
 void SpinConstrain<TK>::print_Mag_Force(std::ofstream& ofs_running)
 {
@@ -600,6 +871,46 @@ void SpinConstrain<TK>::print_Mag_Force(std::ofstream& ofs_running)
     }
 }
 
+/**
+ * @brief Reset DeltaSpin operator initialization state.
+ *
+ * @details The DeltaSpin operator caches internal state (projector matrices, etc.)
+ * from a previous SCF iteration. When the constraint parameters change (e.g., new
+ * target moments or lambda values), the cached state may be invalid. This function
+ * forces the operator to reinitialize on the next call.
+ *
+ * @par When to call
+ * - After changing target_mag_ or constrain_ arrays
+ * - When restarting from a previous SCF calculation with different constraints
+ * - When switching between LCAO and PW basis sets
+ */
+template <typename TK>
+void SpinConstrain<TK>::reset_dspin_operator()
+{
+#ifdef __LCAO
+    if (this->p_operator == nullptr)
+    {
+        return;
+    }
+    if (this->nspin_ == 4)
+    {
+        auto* dspin = dynamic_cast<hamilt::DeltaSpin<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>*>(this->p_operator);
+        if (dspin)
+        {
+            dspin->reset_initialized();
+        }
+    }
+    else if (this->nspin_ == 2)
+    {
+        auto* dspin = dynamic_cast<hamilt::DeltaSpin<hamilt::OperatorLCAO<std::complex<double>, double>>*>(this->p_operator);
+        if (dspin)
+        {
+            dspin->reset_initialized();
+        }
+    }
+#endif
+}
+
 template class SpinConstrain<std::complex<double>>;
 template class SpinConstrain<double>;
 
diff --git a/source/source_lcao/module_deltaspin/spin_constrain.h b/source/source_lcao/module_deltaspin/spin_constrain.h
index 224af123fe4..320371e2ef8 100644
--- a/source/source_lcao/module_deltaspin/spin_constrain.h
+++ b/source/source_lcao/module_deltaspin/spin_constrain.h
@@ -1,10 +1,48 @@
+/**
+ * @file spin_constrain.h
+ * @brief Core header for the DeltaSpin (spin-constrained DFT) module.
+ *
+ * @par Purpose
+ * Implements constrained local spin density (CLSD) functional calculations,
+ * where atomic magnetic moments are constrained to target values via
+ * Lagrange multipliers (lambda). The constrained energy functional is:
+ *   E'[rho] = E[rho] - sum_i lambda_i . (M_i - M_target_i)
+ * where lambda_i is the Lagrange multiplier (magnetic force) on atom i,
+ * M_i is the computed magnetic moment, and M_target_i is the target moment.
+ *
+ * @par Algorithm
+ * The lambda optimization uses a conjugate-gradient-like scheme (run_lambda_loop):
+ *   1. Compute magnetic moments Mi from current wavefunction
+ *   2. Calculate residual: delta_spin = Mi - M_target
+ *   3. Build search direction (steepest descent or Polak-Ribiere CG)
+ *   4. Apply lambda update: lambda += alpha * search_direction
+ *   5. Re-diagonalize Hamiltonian with DeltaSpin correction
+ *   6. Compute new Mi, find optimal alpha via linear interpolation
+ *   7. Repeat until RMS(delta_spin) < sc_thr
+ *
+ * @par Basis Set Support
+ * - LCAO: Uses real-space projection via DeltaSpin operator on density matrix
+ * - PW (Plane Wave): Uses subspace diagonalization with OnsiteProjector becp coefficients
+ *
+ * @par Spin Types
+ * - nspin=2 (collinear): Only z-component constrained, npol=1, uses spin_sign (+1/-1)
+ * - nspin=4 (non-collinear): Full xyz components constrained, npol=2, full Pauli matrices
+ *
+ * @par Convergence Criteria
+ * - RMS error: sqrt(mean(delta_spin^2)) < sc_thr (adaptive threshold)
+ * - Gradient decay: max(dM/dlambda) per atom type < decay_grad[itype]
+ * - Maximum steps: nsc (default 50), minimum steps: nsc_min
+ */
 #ifndef SPIN_CONSTRAIN_H
 #define SPIN_CONSTRAIN_H
 
+#include <complex>
 #include <map>
 #include <vector>
 
 #include "source_base/constants.h"
+#include "source_base/complexmatrix.h"
+#include "source_base/matrix.h"
 #include "source_base/tool_quit.h"
 #include "source_base/tool_title.h"
 #include "source_base/vector3.h"
@@ -21,16 +59,104 @@
 namespace spinconstrain
 {
 
+/**
+ * @brief Convert spinor occupation matrix to magnetic moment vector using Pauli matrices.
+ *
+ * @details For a two-component spinor wavefunction, the spin density matrix is:
+ *   rho = |a|^2    a*b  |   = | (1+Mz)/2    (Mx+iMy)/2 |
+ *         |b*a    |b|^2  |     | (Mx-iMy)/2   (1-Mz)/2  |
+ * The magnetic moment components are extracted via Pauli matrix traces:
+ *   Mx = Tr(rho * sigma_x) = occ[1] + occ[2]           (real part)
+ *   My = Tr(rho * sigma_y) = Im(occ[1] - occ[2])        (imaginary part)
+ *   Mz = Tr(rho * sigma_z) = occ[0] - occ[3]            (real part)
+ * where occ = {|a|^2, a*b, b*a, |b|^2} from becp coefficients.
+ *
+ * @param occ 4-element array of occupation matrix elements (complex)
+ * @param weight k-point weight for integration
+ * @return 3D magnetic moment vector (Mx, My, Mz) in Bohr magnetons
+ */
+inline ModuleBase::Vector3<double> pauli_to_moment(const std::complex<double> occ[4], double weight)
+{
+    return ModuleBase::Vector3<double>(
+        weight * (occ[1] + occ[2]).real(),
+        weight * (occ[1] - occ[2]).imag(),
+        weight * (occ[0] - occ[3]).real()
+    );
+}
+
 struct ScAtomData;
 
+/**
+ * @brief Singleton class implementing spin-constrained DFT (DeltaSpin).
+ *
+ * @par Template parameter TK
+ * - std::complex<double>: Used for nspin=4 (non-collinear) and internally for nspin=2
+ * - double: Stub specialization for nspin=2 collinear (all methods are no-ops)
+ *
+ * @par Design rationale
+ * - Singleton pattern: Only one SpinConstrain instance per TK type is needed,
+ *   shared across the SCF loop. Prevents duplicate state management.
+ * - void* pointers (p_hamilt, psi): Type-erased to avoid template dependency cycles
+ *   with the Hamiltonian and Psi classes. Cast to concrete types at call sites.
+ * - subspace data caching (sub_h_save, sub_s_save, becp_save): For PW basis, the
+ *   subspace Hamiltonian and becp are computed once per SCF iteration and reused
+ *   across multiple lambda steps, avoiding expensive re-computation.
+ *
+ * @par Key workflow (PW basis):
+ *   SCF iteration -> run_lambda_loop()
+ *     -> cal_mw_from_lambda() [first call saves subspace data]
+ *       -> calculate_delta_hcc() [H += becp^† * lambda * becp]
+ *       -> diag_responce() [subspace diagonalization, update becp]
+ *       -> accumulate_Mi_from_becp() [compute magnetic moments]
+ *     -> BFGS optimizer updates lambda
+ *     -> Repeat until RMS(Mi - M_target) < sc_thr
+ *     -> update_psi_charge() [final full-space update]
+ *
+ * @par Error handling
+ * - assert(sub_h_save != nullptr): Called before subspace operations;
+ *   failure means cal_mw_from_lambda() was not called before update_psi_charge().
+ *   Solution: Ensure cal_mw_from_lambda() is called at least once per SCF step.
+ * - "atomCounts is not set": init_sc() was not called or UnitCell data is missing.
+ * - "nspin must be 2 or 4": Invalid spin configuration. nspin=1 is not supported.
+ */
 template <typename TK>
 class SpinConstrain
 {
 public:
     /**
-     * pubic interface for spin-constrained DFT
-    */
-    /// initialize spin-constrained DFT
+     * =============================================================
+     * PUBLIC INTERFACE - Main entry points for the ESolver layer
+     * =============================================================
+     */
+
+    /**
+     * @brief Master initialization: populate all SC parameters from UnitCell and input.
+     *
+     * @details Called once at the start of a DeltaSpin calculation. Performs:
+     *   1. Set input parameters (convergence threshold, max steps, trial alpha)
+     *   2. Get atom/orbital/lnchi counts from UnitCell for indexing
+     *   3. Set nspin and npol (nspin=4 -> npol=2, nspin=2 -> npol=1)
+     *   4. Load target_mag, lambda, constrain from UnitCell (parsed from STRU)
+     *   5. For nspin=2: force x,y constraint flags to 0 (collinear: only z is constrained)
+     *   6. Set solver parameters (k-point list, Hamiltonian, psi, electronic state)
+     *
+     * @param sc_thr_in Convergence threshold for RMS(Mi - M_target) in uB
+     * @param nsc_in Maximum number of inner lambda optimization steps
+     * @param nsc_min_in Minimum number of inner steps before early exit checks
+     * @param alpha_trial_in Initial trial step size (eV/uB^2), converted to Ry internally
+     * @param sccut_in Maximum lambda change per step (eV/uB), converted to Ry internally
+     * @param sc_drop_thr_in Fraction of initial RMS for adaptive threshold
+     * @param ucell Unit cell with atomic positions, STRU constraint data
+     * @param direction_only_in If true, only optimize spin direction (|lambda| -> 0)
+     * @param ParaV_in Parallel orbitals distribution info (LCAO only)
+     * @param nspin_in Spin type: 2=collinear, 4=non-collinear
+     * @param kv_in K-point vector list
+     * @param p_hamilt_in Pointer to Hamiltonian (HamiltLCAO or HamiltPW)
+     * @param psi_in Pointer to wavefunctions (Psi<TK>)
+     * @param dm_in Pointer to density matrix (LCAO only)
+     * @param pelec_in Pointer to electronic state (for charge, weights, ekb)
+     * @param pw_wfc_in PW basis for wavefunction storage (PW only)
+     */
   void init_sc(double sc_thr_in,
                int nsc_in,
                int nsc_min_in,
@@ -38,6 +164,7 @@ class SpinConstrain
                double sccut_in,
                double sc_drop_thr_in,
                const UnitCell& ucell,
+               bool direction_only_in,
                Parallel_Orbitals* ParaV_in,
                int nspin_in,
                const K_Vectors& kv_in,
@@ -49,84 +176,230 @@ class SpinConstrain
 			   elecstate::ElecState* pelec_in,
                ModulePW::PW_Basis_K* pw_wfc_in = nullptr);
 
-  /// @brief calculate the magnetization of each atom with real space projection method for LCAO base
-  /// @param step : the step number of the SCF calculation
-  /// @param print : print the magnetization of each atom if true
+  /**
+   * @brief Calculate atomic magnetic moments using real-space projection (LCAO basis).
+   *
+   * @details Uses the DeltaSpin operator to compute magnetic moments from the density
+   * matrix. For nspin=2, extracts only the z-component. For nspin=4, extracts
+   * all three components from the interleaved 4-component spinor density matrix.
+   * The moments are stored in Mi_ (indexed by global atom index iat).
+   *
+   * @param step Current SCF iteration number (for logging)
+   * @param print Whether to print moments to ofs_running
+   */
   void cal_mi_lcao(const int& step, bool print = false);
 
+  /**
+   * @brief Calculate atomic magnetic moments using projector overlap (PW basis).
+   *
+   * @details For each k-point:
+   *   1. Call OnsiteProjector::tabulate_atomic() to set up atomic projectors
+   *   2. Call OnsiteProjector::overlap_proj_psi() to compute becp = <alpha|psi>
+   *   3. Call accumulate_Mi_from_becp() to decompose becp into magnetic moments
+   * Finally, sum Mi across all MPI k-pool ranks via Parallel_Reduce.
+   */
   void cal_mi_pw();
 
+  /**
+   * @brief Core workflow: apply lambda -> solve Hamiltonian -> compute magnetic moments.
+   *
+   * @details This is the central function called repeatedly during lambda optimization.
+   *
+   * @par LCAO path:
+   *   1. Update lambda in DeltaSpin operator
+   *   2. Solve HSolverLCAO (diagonalize without charge update)
+   *   3. Calculate weights from eigenvalues
+   *   4. Call cal_mi_lcao() to compute moments
+   *
+   * @par PW path:
+   *   1. [First call only] Save subspace H, S, and becp from Hamiltonian
+   *   2. Apply DeltaSpin correction via calculate_delta_hcc()
+   *   3. Diagonalize in subspace via diag_responce(), update becp
+   *   4. Calculate weights from new eigenvalues
+   *   5. Call accumulate_Mi_from_becp() for each k-point
+   *   6. MPI reduce Mi across k-pools
+   *
+   * @param i_step Current inner lambda step (-1 = initialization, 0+ = optimization)
+   * @param delta_lambda Change in lambda from previous step (for incremental H correction)
+   */
   void cal_mw_from_lambda(int i_step, 
 		  const ModuleBase::Vector3<double>* delta_lambda = nullptr);
 
   /**
-   * @brief calculate the energy of \sum_i \lambda_i * Mi
-   * if this->is_mag_converged is true, then this function will calculate the energy and return the real value
-   * if this->is_mag_converged is false, then this function will return 0.0
+   * @brief Calculate the spin constraint energy contribution: E_scon = -sum(lambda_i . Mi_i).
+   *
+   * @details Returns 0.0 if magnetic moments are not yet converged, because the
+   * constraint energy is only physically meaningful when Mi ≈ M_target.
+   * This energy is added to the total DFT energy in the SCF loop.
+   *
+   * @return Constraint energy in Ry (0.0 if not converged)
    */
   double cal_escon();
 
+  /// @brief Get the cached constraint energy from the last cal_escon() call (Ry)
   double get_escon() const;
 
-  void run_lambda_loop(int outer_step, 
+  /**
+   * @brief Main lambda optimization loop using conjugate-gradient-like scheme.
+   *
+   * @details Iteratively adjusts Lagrange multipliers (lambda) to drive atomic
+   * magnetic moments (Mi) toward target values. Uses:
+   * - Polak-Ribiere formula for beta (conjugate direction)
+   * - Linear interpolation for optimal step size (alpha_opt)
+   * - Adaptive alpha_trial adjustment based on convergence behavior
+   * - Gradient decay check for early termination
+   *
+   * @param outer_step Current SCF outer iteration number
+   * @param rerun If true, use full PW solver for final charge update
+   */
+  void run_lambda_loop(int outer_step,
 		  bool rerun = true);
 
-  /// @brief update the charge density for LCAO base with new lambda
-  /// update the charge density and psi for PW base with new lambda
-  void update_psi_charge(const ModuleBase::Vector3<double>* delta_lambda, bool pw_solve = true);
+  /**
+   * @brief Alternative mode: sweep lambda values linearly for energy landscape mapping.
+   *
+   * @details Used for debugging or plotting E(lambda) curves. Scans from
+   * sc_scan_lambda_start to sc_scan_lambda_end in sc_scan_steps steps.
+   * Results written to lambda_scan_results.dat.
+   *
+   * @param outer_step Current SCF outer iteration number
+   */
+  void run_lambda_linear_scan(int outer_step);
 
-  void calculate_delta_hcc(std::complex<double>* h_tmp, 
-		  const std::complex<double>* becp_k, 
-		  const ModuleBase::Vector3<double>* delta_lambda, 
-		  const int nbands, const int nkb, const int* nh_iat);
+  /// @brief Reset DeltaSpin operator initialization state when constraints change
+  void reset_dspin_operator();
 
-  /// lambda loop helper functions
+  /**
+   * @brief Update wavefunctions and charge density after lambda optimization.
+   *
+   * @details Dispatcher to LCAO or PW (CPU/GPU) update paths.
+   * For PW: performs subspace diagonalization + optional full-space refinement.
+   *
+   * @param delta_lambda Lambda change for incremental H correction
+   * @param pw_solve If true, run full PW solver for refinement; if false, just update weights
+   * @param full_update If true, apply full lambda (not delta) to H correction
+   */
+  void update_psi_charge(const ModuleBase::Vector3<double>* delta_lambda, bool pw_solve = true, bool full_update = false);
+
+  /**
+   * @brief Wavefunction and charge density update implementation for PW basis.
+   * @details Two-stage process:
+   *          1. Subspace diagonalization: apply DeltaSpin correction and solve for each k-point
+   *          2. Charge update: full-space diagonalization or direct charge update based on pw_solve
+   */
+  void update_psi_charge_pw(const ModuleBase::Vector3<double>* delta_lambda, bool pw_solve, bool full_update = false);
+  
+  /// CPU implementation of PW basis update
+  void update_psi_charge_pw_cpu(const ModuleBase::Vector3<double>* delta_lambda, bool pw_solve, bool full_update = false);
+  
+#if ((defined __CUDA) || (defined __ROCM))
+  /// GPU implementation of PW basis update
+  void update_psi_charge_pw_gpu(const ModuleBase::Vector3<double>* delta_lambda, bool pw_solve, bool full_update = false);
+#endif
+
+  /**
+   * @brief Compute DeltaSpin correction to the subspace Hamiltonian.
+   *
+   * @details Adds the constraint term to the Hamiltonian in the subspace:
+   *   H_corrected = H_original + becp^† * delta_lambda * becp
+   * For npol=2 (nspin=4), uses full 2x2 Pauli matrix coefficients:
+   *   coeff = | lambda_z      lambda_x + i*lambda_y |
+   *           | lambda_x - i*lambda_y   -lambda_z   |
+   * For npol=1 (nspin=2), only the z-component with spin_sign.
+   *
+   * @param h_tmp Subspace Hamiltonian (nbands x nbands, in/out)
+   * @param becp_k Projector coefficients for k-point ik
+   * @param delta_lambda Lambda change per atom (or full lambda if full_update)
+   * @param nbands Number of bands
+   * @param nkb Total number of projectors
+   * @param nh_iat Number of projectors per atom
+   * @param ik K-point index
+   * @param full_update If true, compute delta = lambda_current - lambda_at_save
+   */
+  void calculate_delta_hcc(std::complex<double>* h_tmp,
+		  const std::complex<double>* becp_k,
+		  const ModuleBase::Vector3<double>* delta_lambda,
+		  const int nbands, const int nkb, const int* nh_iat, const int ik,
+		  bool full_update = false);
+
+#ifdef __LCAO
+  /// @brief Convert orbital matrix to nested vector format [nspin][iat][iw]
+  std::vector<std::vector<std::vector<double>>> convert(const ModuleBase::matrix& orbMulP);
+  /// @brief Calculate magnetic moment from orbital matrix (LCAO alternative path)
+  void calculate_MW(const std::vector<std::vector<std::vector<double>>>& AorbMulP);
+  /// @brief Collect magnetic moment contributions from complex matrix mu*dm
+  void collect_MW(ModuleBase::matrix& MecMulP,
+                  const ModuleBase::ComplexMatrix& mud,
+                  int nw,
+                  int isk);
+#endif
+
+  /// Lambda loop helper: check if RMS error below threshold or max steps reached
   bool check_rms_stop(int outer_step, int i_step, double rms_error, double duration, double total_duration);
 
-  /// apply restriction
+  /// Lambda loop helper: cap step size via restrict_current_ to prevent overshooting
   void check_restriction(const std::vector<ModuleBase::Vector3<double>>& search, double& alpha_trial);
 
-  /// check gradient decay
+  /**
+   * @brief Lambda loop helper: check if dM/dlambda gradient has decayed below threshold.
+   *
+   * @details Computes the diagonal of the susceptibility matrix dM/dlambda for each
+   * atom type. If max gradient < decay_grad[itype], the lambda optimization has
+   * reached diminishing returns and should stop.
+   *
+   * @return true if gradient decayed below threshold, false otherwise
+   */
   bool check_gradient_decay(std::vector<ModuleBase::Vector3<double>> new_spin,
                             std::vector<ModuleBase::Vector3<double>> old_spin,
                             std::vector<ModuleBase::Vector3<double>> new_delta_lambda,
                             std::vector<ModuleBase::Vector3<double>> old_delta_lambda,
                             bool print = false);
-  /// @brief  calculate alpha_opt
+  /// @brief Lambda loop helper: calculate optimal step size via linear interpolation
   double cal_alpha_opt(std::vector<ModuleBase::Vector3<double>> spin,
                        std::vector<ModuleBase::Vector3<double>> spin_plus,
                        const double alpha_trial);
-  /// print header info
+  /// Print header at start of lambda loop
   void print_header();
-  /// print termination message
+  /// Print termination message with final spin and lambda values
   void print_termination();
 
-  /// print mi
+  /// Print magnetic moments to output stream
   void print_Mi(std::ofstream& ofs_running);
 
-  /// print magnetic force, defined as \frac{\delta{L}}/{\delta{Mi}} = -lambda[iat])
+  /// Print magnetic force (defined as dL/dMi = -lambda[iat]) in eV/uB
   void print_Mag_Force(std::ofstream& ofs_running);
 
-  /// @brief use rerun to get higher precision in lambda_loop for PW base
+  /// @brief Use full PW solver (rerun) for higher precision in lambda loop
   bool higher_mag_prec = false;
 
 public:
     /**
-     * important outter class pointers used in spin-constrained DFT
-    */
+     * =============================================================
+     * EXTERNAL POINTERS - Set by init_sc(), used throughout the module
+     * =============================================================
+     *
+     * @par Design rationale for void* pointers
+     * The Hamiltonian and Psi types differ between LCAO and PW bases.
+     * Using void* avoids template coupling and allows the same SpinConstrain
+     * code to work with both basis sets. Concrete types are recovered
+     * via static_cast at call sites.
+     */
+
+    /// @brief Parallel orbitals distribution (row/col mapping for ScaLAPACK)
     Parallel_Orbitals *ParaV = nullptr;
     //--------------------------------------------------------------------------------
-    // pointers for solve Hamiltonian to get new Magnetization from Lambda
-    void* p_hamilt = nullptr;
-    void* psi = nullptr;
-    elecstate::ElecState* pelec = nullptr;
-    ModulePW::PW_Basis_K* pw_wfc_ = nullptr;
+    // Pointers to external objects: Hamiltonian, wavefunctions, electronic state
+    // These are type-erased void* to avoid coupling with specific Hamilt/Psi types
+    void* p_hamilt = nullptr;     ///< Pointer to HamiltLCAO or HamiltPW
+    void* psi = nullptr;          ///< Pointer to Psi<TK> wavefunction container
+    elecstate::ElecState* pelec = nullptr;  ///< Electronic state: ekb, wg, charge, klist
+    ModulePW::PW_Basis_K* pw_wfc_ = nullptr; ///< PW basis for wavefunction storage (PW only)
 #ifdef __LCAO
-    elecstate::DensityMatrix<TK, double>* dm_;
+    elecstate::DensityMatrix<TK, double>* dm_; ///< Density matrix pointer (LCAO only)
 #endif
-    double tpiba = 0.0; /// save ucell.tpiba
-    const double meV_to_Ry = 7.349864435130999e-05;
-    K_Vectors kv_;
+    double tpiba = 0.0; /// @brief 2*pi/a lattice constant scaling factor, saved from UnitCell
+    const double meV_to_Ry = 7.349864435130999e-05; ///< Conversion factor
+    K_Vectors kv_; ///< K-point vector list
     //--------------------------------------------------------------------------------
 
   public:
@@ -222,68 +495,162 @@ class SpinConstrain
                                elecstate::ElecState* pelec_in);
 
   private:
-    SpinConstrain(){};                               // Private constructor
-    ~SpinConstrain(){};                              // Destructor
-    SpinConstrain& operator=(SpinConstrain const&) = delete;  // Copy assign
-    SpinConstrain& operator=(SpinConstrain &&) = delete;      // Move assign
-    std::map<int, std::vector<ScAtomData>> ScData;
-    std::map<int, double> ScDecayGrad; // in unit of uB^2/eV
-    std::vector<double> decay_grad_;   // in unit of uB^2/Ry
-    std::map<int, int> atomCounts;
-    std::map<int, int> orbitalCounts;
-    std::map<int, std::map<int, int>> lnchiCounts;
-    std::vector<ModuleBase::Vector3<double>> lambda_; // in unit of Ry/uB in code, but in unit of meV/uB in input file
-    std::vector<ModuleBase::Vector3<double>> target_mag_; // in unit of uB
-    std::vector<ModuleBase::Vector3<double>> Mi_; // in unit of uB
-    std::vector<std::string> atomLabels_;
-    double escon_ = 0.0;
-    int nspin_ = 0;
-    int npol_ = 1;
     /**
-     * input parameters for lambda-loop
+     * =============================================================
+     * PRIVATE DATA MEMBERS - Internal state of SpinConstrain
+     * =============================================================
+     *
+     * @par Unit conversion
+     * - lambda_: Ry/uB internally, but meV/uB in input file (STRU)
+     * - target_mag_, Mi_: uB (Bohr magnetons)
+     * - alpha_trial_: Ry/uB^2 internally, but input is eV/uB^2
+     * - restrict_current_: Ry/uB internally, but input is eV/uB
+     * - decay_grad_: uB^2/Ry internally, but uB^2/eV in ScDecayGrad
+     *
+     * @par Indexing
+     * All per-atom arrays (lambda_, target_mag_, Mi_, constrain_) are indexed
+     * by GLOBAL atom index (iat), which runs from 0 to nat-1. The mapping
+     * from (element_type, local_atom_index) to iat is handled by get_iat().
      */
-    int nsc_;
-    int nsc_min_;
-    double sc_drop_thr_ = 1e-3;
-    double sc_thr_; // in unit of uB
-    double current_sc_thr_;
-    std::vector<ModuleBase::Vector3<int>> constrain_;
-    bool debug = false;
-    double alpha_trial_; // in unit of Ry/uB^2 = 0.01 eV/uB^2
-    double restrict_current_; // in unit of Ry/uB = 3 eV/uB
+    SpinConstrain(){};                               ///< Private constructor (Singleton)
+    ~SpinConstrain()
+    {
+        delete[] sub_h_save;
+        delete[] sub_s_save;
+        delete[] becp_save;
+        sub_h_save = nullptr;
+        sub_s_save = nullptr;
+        becp_save = nullptr;
+    };
+    SpinConstrain& operator=(SpinConstrain const&) = delete;  ///< Copy assignment deleted
+    SpinConstrain& operator=(SpinConstrain &&) = delete;      ///< Move assignment deleted
+    std::map<int, std::vector<ScAtomData>> ScData; ///< Raw constraint data indexed by element type (itype)
+    std::map<int, double> ScDecayGrad; ///< Gradient decay thresholds (uB^2/eV) per element type
+    std::vector<double> decay_grad_;   ///< Gradient decay thresholds converted to uB^2/Ry, per element type
+    std::map<int, int> atomCounts;     ///< Number of atoms per element type: {itype -> nat_itype}
+    std::map<int, int> orbitalCounts;  ///< Number of orbitals per element type: {itype -> nw_itype}
+    std::map<int, std::map<int, int>> lnchiCounts; ///< {itype -> {L -> nchi}}: angular momentum channels
+    std::vector<ModuleBase::Vector3<double>> lambda_; ///< Lagrange multipliers (Ry/uB) per atom, 3 components
+    std::vector<ModuleBase::Vector3<double>> target_mag_; ///< Target magnetic moments (uB) per atom
+    std::vector<ModuleBase::Vector3<double>> Mi_; ///< Current computed magnetic moments (uB) per atom
+    std::vector<std::string> atomLabels_; ///< Human-readable labels: "Fe_0", "Fe_1", etc.
+    double escon_ = 0.0; ///< Cached constraint energy from last cal_escon() call (Ry)
+    int nspin_ = 0; ///< Spin type: 2=collinear, 4=non-collinear
+    int npol_ = 1; ///< Number of spinor components: 1 for nspin=2, 2 for nspin=4
+    /**
+     * =============================================================
+     * LAMBDA LOOP INPUT PARAMETERS
+     * =============================================================
+     */
+    int nsc_; ///< Maximum number of inner lambda optimization steps
+    int nsc_min_; ///< Minimum steps before early exit checks (gradient decay)
+    double sc_drop_thr_ = 1e-3; ///< Fraction of initial RMS for adaptive threshold
+    double sc_thr_; ///< Convergence threshold for RMS(Mi - M_target) in uB
+    double current_sc_thr_; ///< Adaptive threshold: max(initial_rms * sc_drop_thr_, sc_thr_)
+    std::vector<ModuleBase::Vector3<int>> constrain_; ///< Per-atom/component constraint flags: 0=free, 1=constrained
+    bool debug = false; ///< Debug flag for verbose output
+    double alpha_trial_; ///< Initial trial step size (Ry/uB^2), adaptively adjusted during loop
+    double restrict_current_; ///< Maximum allowed lambda change per step (Ry/uB), prevents overshooting
+    bool direction_only_ = false; ///< If true, only optimize spin direction (project out parallel lambda component)
 
   public:
-    /// @brief save operator for spin-constrained DFT
-    /// @param op_in the base pointer of operator, actual type should be DeltaSpin<OperatorLCAO<TK, TR>>*
+    /// @brief Set DeltaSpin operator pointer for magnetic moment calculation (LCAO)
+    /// @param op_in Base pointer, actual type is DeltaSpin<OperatorLCAO<TK, TR>>*
     void set_operator(hamilt::Operator<TK>* op_in);
-    /// @brief set is_Mi_converged
+    /// @brief Set magnetic moment convergence flag
     void set_mag_converged(bool is_Mi_converged_in){this->is_Mi_converged = is_Mi_converged_in;}
-    /// @brief get is_Mi_converged
+    /// @brief Get magnetic moment convergence flag
     bool mag_converged() const {return this->is_Mi_converged;}
+    void set_npol(int npol);
+    int get_npol() const;
+    int get_nw() const; ///< Total number of orbitals across all constrained atoms
+    int get_iwt(int itype, int iat, int orbital_index) const; ///< Convert (itype, iat, iw) to global orbital index
+    /// @brief Get spin sign for k-point ik: +1 for spin-up, -1 for spin-down (nspin=2 only)
+    int get_spin_sign(int ik) const;
+    /**
+     * @brief Accumulate magnetic moments from becp coefficients for a single k-point.
+     *
+     * @details For npol=2 (nspin=4), computes full Pauli decomposition:
+     *   occ[0] = sum(becp_up^* * becp_up), occ[1] = sum(becp_up^* * becp_dn),
+     *   occ[2] = sum(becp_dn^* * becp_up), occ[3] = sum(becp_dn^* * becp_dn)
+     *   Mi = pauli_to_moment(occ, weight)
+     * For npol=1 (nspin=2), only z-component:
+     *   occ = sum(|becp|^2), Mi.z += weight * occ * spin_sign
+     *
+     * @param becp Projector coefficients <alpha_{l,m}|psi_{k,i}>
+     * @param nkb Total number of projectors
+     * @param nbands Number of bands
+     * @param npol Number of spinor components
+     * @param ik K-point index (for spin_sign lookup in nspin=2)
+     * @param wg_ik Band weights for this k-point
+     * @param nh_iat Number of projectors per atom
+     */
+    void accumulate_Mi_from_becp(const std::complex<double>* becp,
+                                 int nkb,
+                                 int nbands,
+                                 int npol,
+                                 int ik,
+                                 const double* wg_ik,
+                                 const int* nh_iat);
   private:
-    /// operator for spin-constrained DFT, used for calculating current atomic magnetic moment
+    /// DeltaSpin operator pointer for LCAO magnetic moment calculation
     hamilt::Operator<TK>* p_operator = nullptr;
-    /// @brief if atomic magnetic moment is converged
+    /// @brief Flag: has the magnetic moment converged in the current SCF iteration?
     bool is_Mi_converged = false;
 
-    TK* sub_h_save = nullptr;
-    TK* sub_s_save = nullptr;
-    TK* becp_save = nullptr;
+    /**
+     * =============================================================
+     * SUBSPACE DATA CACHING (PW basis only)
+     * =============================================================
+     *
+     * @par Purpose
+     * In the PW basis, the subspace Hamiltonian H_sub = <psi|H|psi> and
+     * becp coefficients are expensive to compute. They are cached on the
+     * first call to cal_mw_from_lambda() and reused across multiple lambda
+     * steps within the same SCF iteration.
+     *
+     * @par Layout
+     * - sub_h_save[ik * nbands * nbands + i * nbands + j]: H_sub for k-point ik
+     * - sub_s_save: same layout for overlap matrix S_sub
+     * - becp_save[ik * size_becp + ib * nkb * npol + ip]: becp coefficients
+     * - lambda_in_sub_: lambda values at the time subspace data was saved,
+     *   used to compute delta_lambda for incremental H corrections
+     *
+     * @par Memory management
+     * Allocated with new[] on first cal_mw_from_lambda() call, freed in
+     * update_psi_charge_pw_cpu/gpu() after final subspace diagonalization.
+     */
+    TK* sub_h_save = nullptr;       ///< Cached subspace Hamiltonian for all k-points
+    TK* sub_s_save = nullptr;       ///< Cached subspace overlap matrix for all k-points
+    TK* becp_save = nullptr;        ///< Cached becp coefficients for all k-points
+    std::vector<ModuleBase::Vector3<double>> lambda_in_sub_; ///< Lambda values when subspace was saved
 };
 
 
 /**
- * @brief struct for storing parameters of non-collinear spin-constrained DFT
+ * @brief Per-atom spin constraint parameters parsed from STRU file.
+ *
+ * @details Stores the raw constraint data for a single atom before
+ * it is distributed to the flat arrays (lambda_, target_mag_, constrain_).
+ * The constraint data is organized by element type (itype) in the ScData map.
+ *
+ * @par Target moment specification (mag_type):
+ * - mag_type=0: Direct Cartesian components (mx, my, mz) in uB
+ * - mag_type=1: Spherical coordinates (magnitude, theta, phi)
+ *   - target_mag_val: |M| in uB
+ *   - target_mag_angle1: polar angle theta (degrees) from z-axis
+ *   - target_mag_angle2: azimuthal angle phi (degrees) in xy-plane
+ *   Conversion: Mx = |M|*sin(theta)*cos(phi), My = |M|*sin(theta)*sin(phi), Mz = |M|*cos(theta)
  */
 struct ScAtomData {
-    int index;
-    std::vector<double> lambda;
-    std::vector<double> target_mag;
-    std::vector<int> constrain;
-    int mag_type;
-    double target_mag_val;
-    double target_mag_angle1;
-    double target_mag_angle2;
+    int index;                              ///< Local atom index within its element type
+    std::vector<double> lambda;             ///< Initial lambda values (Ry/uB), 3 components (x,y,z)
+    std::vector<double> target_mag;         ///< Target magnetic moment (uB), 3 components
+    std::vector<int> constrain;             ///< Constraint flags: 0=free, 1=constrained, per component
+    int mag_type;                           ///< 0=Cartesian (mx,my,mz), 1=spherical (|M|,theta,phi)
+    double target_mag_val;                  ///< For mag_type=1: target moment magnitude (uB)
+    double target_mag_angle1;               ///< For mag_type=1: polar angle theta (degrees)
+    double target_mag_angle2;               ///< For mag_type=1: azimuthal angle phi (degrees)
 };
 
 } // namespace spinconstrain
diff --git a/source/source_lcao/module_deltaspin/template_helpers.cpp b/source/source_lcao/module_deltaspin/template_helpers.cpp
index 83e5f17f75e..add9ee6398c 100644
--- a/source/source_lcao/module_deltaspin/template_helpers.cpp
+++ b/source/source_lcao/module_deltaspin/template_helpers.cpp
@@ -1,22 +1,56 @@
 #include "spin_constrain.h"
 
+/**
+ * @file template_helpers.cpp
+ * @brief Stub implementations for the TK=double (nspin=2) template specialization.
+ *
+ * @par Why stubs?
+ * Even for nspin=2 (collinear spin), ABACUS uses complex arithmetic internally
+ * for the Hamiltonian and wavefunctions. The TK=double specialization exists
+ * only to satisfy the linker when the code is compiled with nspin=2.
+ * All actual computation is done by the TK=std::complex<double> specialization.
+ *
+ * @par Design rationale
+ * - The SpinConstrain template is instantiated for both TK=double and TK=complex<double>
+ * - For TK=double, all methods that perform actual computation are no-ops
+ * - Simple getters/setters (nspin, npol, atomCounts, etc.) still work correctly
+ *   because they are not template-specialized (they use the base template)
+ *
+ * @par Methods stubbed
+ * - cal_mw_from_lambda: no-op (computed by complex<double> specialization)
+ * - cal_mi_lcao: no-op (computed by complex<double> specialization)
+ * - run_lambda_loop: no-op (computed by complex<double> specialization)
+ * - check_rms_stop: returns false (continue loop)
+ * - check_restriction: no-op
+ * - cal_alpha_opt: returns 0.0
+ * - print_termination: no-op
+ * - print_header: no-op
+ * - check_gradient_decay: returns false (no early termination)
+ * - run_lambda_linear_scan: no-op
+ * - reset_dspin_operator: no-op
+ */
+
+/// @brief cal_mw_from_lambda stub (TK=double): no-op
 template <>
-void spinconstrain::SpinConstrain<double>::cal_mw_from_lambda(int i_step, 
+void spinconstrain::SpinConstrain<double>::cal_mw_from_lambda(int i_step,
 		const ModuleBase::Vector3<double>* delta_lambda)
 {
 }
 
+/// @brief cal_mi_lcao stub (TK=double): no-op
 template <>
 void spinconstrain::SpinConstrain<double>::cal_mi_lcao(const int& step, bool print)
 {
 }
 
+/// @brief run_lambda_loop stub (TK=double): no-op
 template <>
-void spinconstrain::SpinConstrain<double>::run_lambda_loop(int outer_step, 
+void spinconstrain::SpinConstrain<double>::run_lambda_loop(int outer_step,
 		bool rerun)
 {
 }
 
+/// @brief check_rms_stop stub (TK=double): always return false (continue)
 template <>
 bool spinconstrain::SpinConstrain<double>::check_rms_stop(int outer_step,
                                                                     int i_step,
@@ -27,6 +61,7 @@ bool spinconstrain::SpinConstrain<double>::check_rms_stop(int outer_step,
     return false;
 }
 
+/// @brief check_restriction stub (TK=double): no-op
 template <>
 void spinconstrain::SpinConstrain<double>::check_restriction(
     const std::vector<ModuleBase::Vector3<double>>& search,
@@ -34,7 +69,7 @@ void spinconstrain::SpinConstrain<double>::check_restriction(
 {
 }
 
-/// calculate alpha_opt
+/// @brief cal_alpha_opt stub (TK=double): return 0.0
 template <>
 double spinconstrain::SpinConstrain<double>::cal_alpha_opt(std::vector<ModuleBase::Vector3<double>> spin,
                                                                      std::vector<ModuleBase::Vector3<double>> spin_plus,
@@ -43,16 +78,19 @@ double spinconstrain::SpinConstrain<double>::cal_alpha_opt(std::vector<ModuleBas
     return 0.0;
 }
 
+/// @brief print_termination stub (TK=double): no-op
 template <>
 void spinconstrain::SpinConstrain<double>::print_termination()
 {
 }
 
+/// @brief print_header stub (TK=double): no-op
 template <>
 void spinconstrain::SpinConstrain<double>::print_header()
 {
 }
 
+/// @brief check_gradient_decay stub (TK=double): always return false (no early termination)
 template <>
 bool spinconstrain::SpinConstrain<double>::check_gradient_decay(
     std::vector<ModuleBase::Vector3<double>> new_spin,
@@ -63,3 +101,15 @@ bool spinconstrain::SpinConstrain<double>::check_gradient_decay(
 {
     return false;
 }
+
+/// @brief run_lambda_linear_scan stub (TK=double): no-op
+template <>
+void spinconstrain::SpinConstrain<double>::run_lambda_linear_scan(int outer_step)
+{
+}
+
+/// @brief reset_dspin_operator stub (TK=double): no-op
+template <>
+void spinconstrain::SpinConstrain<double>::reset_dspin_operator()
+{
+}
diff --git a/source/source_lcao/module_deltaspin/test/CMakeLists.txt b/source/source_lcao/module_deltaspin/test/CMakeLists.txt
index 04a21d73d55..6e80e7ce359 100644
--- a/source/source_lcao/module_deltaspin/test/CMakeLists.txt
+++ b/source/source_lcao/module_deltaspin/test/CMakeLists.txt
@@ -22,4 +22,10 @@ AddTest(
     ../spin_constrain.cpp
     ../template_helpers.cpp
 )
-endif() 
+
+AddTest(
+  TARGET deltaspin_pw_test
+  LIBS ${math_libs} base device parameter
+  SOURCES deltaspin_pw_test.cpp
+)
+endif()
diff --git a/source/source_lcao/module_deltaspin/test/deltaspin_pw_test.cpp b/source/source_lcao/module_deltaspin/test/deltaspin_pw_test.cpp
new file mode 100644
index 00000000000..30274d23677
--- /dev/null
+++ b/source/source_lcao/module_deltaspin/test/deltaspin_pw_test.cpp
@@ -0,0 +1,566 @@
+#include "gtest/gtest.h"
+#include <complex>
+#include <cmath>
+#include <vector>
+
+#define private public
+#include "source_io/module_parameter/parameter.h"
+#undef private
+
+/***********************************************************************
+ * Unit tests for DeltaSpin PW support
+ *
+ * Strategy: test the core arithmetic of calculate_delta_hcc and
+ * cal_Mi_pw as pure formulas — no OnsiteProjector or full ABACUS
+ * framework needed.
+ ***********************************************************************/
+
+class DeltaSpinPwTest : public ::testing::Test
+{
+  protected:
+    void SetUp() override {}
+    void TearDown() override {}
+};
+
+// =====================================================================
+// calculate_delta_hcc: ps array construction (npol=2, Pauli matrix)
+// =====================================================================
+
+TEST_F(DeltaSpinPwTest, DeltaHcc_Npol2_SingleAtom)
+{
+    // npol=2: for each (ib, ip):
+    //   ps[becpind]      += coeff0 * becp1 + coeff2 * becp2
+    //   ps[becpind+nkb]  += coeff1 * becp1 + coeff3 * becp2
+    // where coeff0 = (lambda_z, 0), coeff1 = (lambda_x, lambda_y),
+    //       coeff2 = (lambda_x, -lambda_y), coeff3 = (-lambda_z, 0)
+
+    const int nat = 1;
+    const int nproj = 2; // 2 projectors for this atom
+    const int nbands = 1;
+    const int nkb = nproj; // total projectors = nproj for single atom
+    const int npol = 2;
+
+    // delta_lambda for atom 0
+    struct { double x, y, z; } delta_lambda = {0.5, 0.3, 0.8};
+
+    const std::complex<double> coeff0(delta_lambda.z, 0.0);           // (0.8, 0)
+    const std::complex<double> coeff1(delta_lambda.x, delta_lambda.y); // (0.5, 0.3)
+    const std::complex<double> coeff2(delta_lambda.x, -delta_lambda.y);// (0.5, -0.3)
+    const std::complex<double> coeff3(-delta_lambda.z, 0.0);          // (-0.8, 0)
+
+    // becp: layout [ib * npol * nkb + sum + ip] for up, +nkb for down
+    std::vector<std::complex<double>> becp(nbands * npol * nkb, {0.0, 0.0});
+    // band 0, projector 0
+    becp[0 * npol * nkb + 0] = {1.0, 0.2};       // becp_up[0]
+    becp[0 * npol * nkb + 0 + nkb] = {0.3, -0.1}; // becp_dn[0]
+    // band 0, projector 1
+    becp[0 * npol * nkb + 1] = {0.5, 0.0};        // becp_up[1]
+    becp[0 * npol * nkb + 1 + nkb] = {0.0, 0.7};  // becp_dn[1]
+
+    std::vector<std::complex<double>> ps(nbands * npol * nkb, {0.0, 0.0});
+
+    int sum = 0;
+    for(int ib = 0; ib < nbands * npol; ib += npol)
+    {
+        for(int ip = 0; ip < nproj; ip++)
+        {
+            const int becpind = ib * nkb + sum + ip;
+            const std::complex<double> becp1 = becp[becpind];
+            const std::complex<double> becp2 = becp[becpind + nkb];
+            ps[becpind] += coeff0 * becp1 + coeff2 * becp2;
+            ps[becpind + nkb] += coeff1 * becp1 + coeff3 * becp2;
+        }
+    }
+
+    // Verify projector 0:
+    // ps_up[0] = (0.8,0)*(1.0,0.2) + (0.5,-0.3)*(0.3,-0.1)
+    //          = (0.8, 0.16) + (0.15-0.03, -0.05-0.09) = (0.8,0.16) + (0.12,-0.14)
+    //          = (0.92, 0.02)
+    EXPECT_NEAR(ps[0].real(), 0.92, 1e-12);
+    EXPECT_NEAR(ps[0].imag(), 0.02, 1e-12);
+
+    // ps_dn[0] = (0.5,0.3)*(1.0,0.2) + (-0.8,0)*(0.3,-0.1)
+    //          = (0.5-0.06, 0.3+0.1) + (-0.24, 0.08)
+    //          = (0.44, 0.4) + (-0.24, 0.08) = (0.20, 0.48)
+    EXPECT_NEAR(ps[0 + nkb].real(), 0.20, 1e-12);
+    EXPECT_NEAR(ps[0 + nkb].imag(), 0.48, 1e-12);
+}
+
+// PLACEHOLDER_DELTASPIN_PW_TESTS
+
+TEST_F(DeltaSpinPwTest, DeltaHcc_Npol2_MultiAtom)
+{
+    // Two atoms: verify sum offset advances correctly
+    const int nat = 2;
+    const int nproj_0 = 1, nproj_1 = 1;
+    const int nkb = nproj_0 + nproj_1; // 2
+    const int nbands = 1;
+    const int npol = 2;
+
+    struct Vec3 { double x, y, z; };
+    Vec3 delta_lambda[2] = {{1.0, 0.0, 0.0}, {0.0, 0.0, 2.0}};
+
+    std::vector<std::complex<double>> becp(nbands * npol * nkb, {0.0, 0.0});
+    // atom 0, proj 0: becp_up = (1,0), becp_dn = (0,0)
+    becp[0] = {1.0, 0.0};
+    becp[0 + nkb] = {0.0, 0.0};
+    // atom 1, proj 0: becp_up = (0,0), becp_dn = (1,0)
+    becp[1] = {0.0, 0.0};
+    becp[1 + nkb] = {1.0, 0.0};
+
+    std::vector<std::complex<double>> ps(nbands * npol * nkb, {0.0, 0.0});
+    int nh_iat[2] = {nproj_0, nproj_1};
+
+    int sum = 0;
+    for(int iat = 0; iat < nat; iat++)
+    {
+        const std::complex<double> c0(delta_lambda[iat].z, 0.0);
+        const std::complex<double> c1(delta_lambda[iat].x, delta_lambda[iat].y);
+        const std::complex<double> c2(delta_lambda[iat].x, -delta_lambda[iat].y);
+        const std::complex<double> c3(-delta_lambda[iat].z, 0.0);
+        for(int ib = 0; ib < nbands * npol; ib += npol)
+        {
+            for(int ip = 0; ip < nh_iat[iat]; ip++)
+            {
+                const int becpind = ib * nkb + sum + ip;
+                const std::complex<double> b1 = becp[becpind];
+                const std::complex<double> b2 = becp[becpind + nkb];
+                ps[becpind] += c0 * b1 + c2 * b2;
+                ps[becpind + nkb] += c1 * b1 + c3 * b2;
+            }
+        }
+        sum += nh_iat[iat];
+    }
+
+    // atom 0: lambda=(1,0,0), becp_up=(1,0), becp_dn=(0,0)
+    // ps_up[0] = (0,0)*(1,0) + (1,0)*(0,0) = 0
+    // ps_dn[0] = (1,0)*(1,0) + (0,0)*(0,0) = (1,0)
+    EXPECT_NEAR(ps[0].real(), 0.0, 1e-12);
+    EXPECT_NEAR(ps[0 + nkb].real(), 1.0, 1e-12);
+
+    // atom 1: lambda=(0,0,2), becp_up=(0,0), becp_dn=(1,0)
+    // ps_up[1] = (2,0)*(0,0) + (0,0)*(1,0) = 0
+    // ps_dn[1] = (0,0)*(0,0) + (-2,0)*(1,0) = (-2,0)
+    EXPECT_NEAR(ps[1].real(), 0.0, 1e-12);
+    EXPECT_NEAR(ps[1 + nkb].real(), -2.0, 1e-12);
+}
+
+TEST_F(DeltaSpinPwTest, DeltaHcc_Npol1_SignPositive)
+{
+    // npol=1: ps[becpind] += sign * lambda_z * becp1
+    const int nat = 1;
+    const int nproj = 2;
+    const int nkb = nproj;
+    const int nbands = 1;
+    const int sign = 1;
+    const double lambda_z = 0.5;
+
+    std::vector<std::complex<double>> becp(nbands * nkb, {0.0, 0.0});
+    becp[0] = {1.0, 0.3};
+    becp[1] = {0.0, -0.5};
+
+    std::vector<std::complex<double>> ps(nbands * nkb, {0.0, 0.0});
+    double coeff = lambda_z * sign;
+    int sum = 0;
+    for(int ib = 0; ib < nbands; ib++)
+    {
+        for(int ip = 0; ip < nproj; ip++)
+        {
+            const int becpind = ib * nkb + sum + ip;
+            ps[becpind] += coeff * becp[becpind];
+        }
+    }
+
+    // ps[0] = 0.5 * (1.0, 0.3) = (0.5, 0.15)
+    EXPECT_NEAR(ps[0].real(), 0.5, 1e-12);
+    EXPECT_NEAR(ps[0].imag(), 0.15, 1e-12);
+    // ps[1] = 0.5 * (0, -0.5) = (0, -0.25)
+    EXPECT_NEAR(ps[1].real(), 0.0, 1e-12);
+    EXPECT_NEAR(ps[1].imag(), -0.25, 1e-12);
+}
+
+TEST_F(DeltaSpinPwTest, DeltaHcc_Npol1_SignNegative)
+{
+    const int nkb = 1;
+    const int nbands = 1;
+    const int sign = -1;
+    const double lambda_z = 0.5;
+
+    std::vector<std::complex<double>> becp(nbands * nkb, {0.0, 0.0});
+    becp[0] = {1.0, 0.0};
+
+    std::vector<std::complex<double>> ps(nbands * nkb, {0.0, 0.0});
+    double coeff = lambda_z * sign;
+    ps[0] += coeff * becp[0];
+
+    EXPECT_NEAR(ps[0].real(), -0.5, 1e-12);
+    EXPECT_NEAR(ps[0].imag(), 0.0, 1e-12);
+}
+
+TEST_F(DeltaSpinPwTest, DeltaHcc_Npol2_ZeroLambda)
+{
+    // lambda = (0,0,0) => ps should remain zero
+    const int nkb = 2;
+    const int nbands = 1;
+    const int npol = 2;
+
+    std::vector<std::complex<double>> becp(nbands * npol * nkb, {0.0, 0.0});
+    becp[0] = {1.0, 0.5};
+    becp[1] = {0.3, -0.2};
+    becp[0 + nkb] = {0.7, 0.1};
+    becp[1 + nkb] = {-0.4, 0.8};
+
+    std::vector<std::complex<double>> ps(nbands * npol * nkb, {0.0, 0.0});
+
+    const std::complex<double> c0(0.0, 0.0);
+    const std::complex<double> c1(0.0, 0.0);
+    const std::complex<double> c2(0.0, 0.0);
+    const std::complex<double> c3(0.0, 0.0);
+
+    for(int ip = 0; ip < nkb; ip++)
+    {
+        ps[ip] += c0 * becp[ip] + c2 * becp[ip + nkb];
+        ps[ip + nkb] += c1 * becp[ip] + c3 * becp[ip + nkb];
+    }
+
+    for(int i = 0; i < nbands * npol * nkb; i++)
+    {
+        EXPECT_NEAR(ps[i].real(), 0.0, 1e-15);
+        EXPECT_NEAR(ps[i].imag(), 0.0, 1e-15);
+    }
+}
+
+// =====================================================================
+// cal_Mi_pw: magnetization accumulation from becp
+// =====================================================================
+
+TEST_F(DeltaSpinPwTest, MiPw_Npol1_SpinUp)
+{
+    // npol=1, nspin=2: Mi.z += sign * weight * |becp|^2
+    // spin-up (sign=+1)
+    const int nkb = 3;
+    const int nbands = 2;
+    const int sign = 1;
+    const double weights[2] = {1.0, 0.5};
+
+    std::vector<std::complex<double>> becp(nbands * nkb, {0.0, 0.0});
+    // band 0
+    becp[0] = {0.8, 0.0};
+    becp[1] = {0.0, 0.6};
+    becp[2] = {0.3, 0.4};
+    // band 1
+    becp[3] = {0.5, 0.0};
+    becp[4] = {0.0, 0.0};
+    becp[5] = {1.0, 0.0};
+
+    // Single atom with nh=3
+    double Mi_z = 0.0;
+    for(int ib = 0; ib < nbands; ib++)
+    {
+        const double weight = weights[ib];
+        double occ = 0.0;
+        for(int ih = 0; ih < nkb; ih++)
+        {
+            const int index = ib * nkb + ih;
+            occ += (std::conj(becp[index]) * becp[index]).real();
+        }
+        Mi_z += sign * weight * occ;
+    }
+
+    // band0: |0.8|^2 + |0.6|^2 + |0.3+0.4i|^2 = 0.64 + 0.36 + 0.25 = 1.25, w=1.0
+    // band1: |0.5|^2 + 0 + |1.0|^2 = 0.25 + 1.0 = 1.25, w=0.5
+    // Mi_z = 1*1.25 + 0.5*1.25 = 1.875
+    EXPECT_NEAR(Mi_z, 1.875, 1e-12);
+}
+
+TEST_F(DeltaSpinPwTest, MiPw_Npol1_SpinDown)
+{
+    // spin-down (sign=-1)
+    const int nkb = 1;
+    const int nbands = 1;
+    const int sign = -1;
+    const double weight = 2.0;
+
+    std::vector<std::complex<double>> becp(1, {0.0, 0.0});
+    becp[0] = {0.6, 0.8}; // |becp|^2 = 0.36 + 0.64 = 1.0
+
+    double Mi_z = 0.0;
+    double occ = (std::conj(becp[0]) * becp[0]).real();
+    Mi_z += sign * weight * occ;
+
+    EXPECT_NEAR(Mi_z, -2.0, 1e-12);
+}
+
+TEST_F(DeltaSpinPwTest, MiPw_Npol2_PureZMag)
+{
+    // npol=2: construct becp so that only z-component is nonzero
+    // becp_up = (a, 0), becp_dn = (0, 0)
+    // occ[0] = |a|^2, occ[1]=0, occ[2]=0, occ[3]=0
+    // Mi.z = w*(occ0-occ3) = w*|a|^2, Mi.x = 0, Mi.y = 0
+    const int nkb = 1;
+    const int nbands = 1;
+    const double weight = 1.0;
+
+    std::vector<std::complex<double>> becp(nbands * 2 * nkb, {0.0, 0.0});
+    becp[0] = {0.7, 0.0};       // becp_up
+    becp[0 + nkb] = {0.0, 0.0}; // becp_dn
+
+    double Mi_x = 0.0, Mi_y = 0.0, Mi_z = 0.0;
+    std::complex<double> occ[4] = {{0,0},{0,0},{0,0},{0,0}};
+    occ[0] = std::conj(becp[0]) * becp[0];
+    occ[1] = std::conj(becp[0]) * becp[0 + nkb];
+    occ[2] = std::conj(becp[0 + nkb]) * becp[0];
+    occ[3] = std::conj(becp[0 + nkb]) * becp[0 + nkb];
+
+    Mi_z += weight * (occ[0] - occ[3]).real();
+    Mi_x += weight * (occ[1] + occ[2]).real();
+    Mi_y += weight * (occ[1] - occ[2]).imag();
+
+    EXPECT_NEAR(Mi_z, 0.49, 1e-12);
+    EXPECT_NEAR(Mi_x, 0.0, 1e-15);
+    EXPECT_NEAR(Mi_y, 0.0, 1e-15);
+}
+
+TEST_F(DeltaSpinPwTest, MiPw_Npol2_PureXMag)
+{
+    // Construct becp so that only x-component is nonzero
+    // becp_up = (a, 0), becp_dn = (a, 0) with same magnitude
+    // occ[0] = |a|^2, occ[1] = |a|^2, occ[2] = |a|^2, occ[3] = |a|^2
+    // Mi.z = w*(occ0-occ3) = 0
+    // Mi.x = w*(occ1+occ2).real = w*2*|a|^2
+    // Mi.y = w*(occ1-occ2).imag = 0
+    const int nkb = 1;
+    const int nbands = 1;
+    const double weight = 1.0;
+    const double a = 0.5;
+
+    std::vector<std::complex<double>> becp(nbands * 2 * nkb, {0.0, 0.0});
+    becp[0] = {a, 0.0};
+    becp[0 + nkb] = {a, 0.0};
+
+    std::complex<double> occ[4];
+    occ[0] = std::conj(becp[0]) * becp[0];
+    occ[1] = std::conj(becp[0]) * becp[0 + nkb];
+    occ[2] = std::conj(becp[0 + nkb]) * becp[0];
+    occ[3] = std::conj(becp[0 + nkb]) * becp[0 + nkb];
+
+    double Mi_z = weight * (occ[0] - occ[3]).real();
+    double Mi_x = weight * (occ[1] + occ[2]).real();
+    double Mi_y = weight * (occ[1] - occ[2]).imag();
+
+    EXPECT_NEAR(Mi_z, 0.0, 1e-15);
+    EXPECT_NEAR(Mi_x, 0.5, 1e-12); // 2*0.25
+    EXPECT_NEAR(Mi_y, 0.0, 1e-15);
+}
+
+TEST_F(DeltaSpinPwTest, MiPw_Npol2_MixedMag)
+{
+    // General becp: verify all three components
+    const int nkb = 1;
+    const int nbands = 1;
+    const double weight = 1.0;
+
+    std::vector<std::complex<double>> becp(nbands * 2 * nkb, {0.0, 0.0});
+    becp[0] = {0.8, 0.0};        // becp_up
+    becp[0 + nkb] = {0.0, 0.6};  // becp_dn
+
+    std::complex<double> occ[4];
+    occ[0] = std::conj(becp[0]) * becp[0];           // 0.64
+    occ[1] = std::conj(becp[0]) * becp[0 + nkb];     // 0.8*(0,0.6) = (0, 0.48)
+    occ[2] = std::conj(becp[0 + nkb]) * becp[0];     // (0,-0.6)*0.8 = (0, -0.48)
+    occ[3] = std::conj(becp[0 + nkb]) * becp[0 + nkb]; // 0.36
+
+    double Mi_z = weight * (occ[0] - occ[3]).real();
+    double Mi_x = weight * (occ[1] + occ[2]).real();
+    double Mi_y = weight * (occ[1] - occ[2]).imag();
+
+    EXPECT_NEAR(Mi_z, 0.28, 1e-12);  // 0.64 - 0.36
+    EXPECT_NEAR(Mi_x, 0.0, 1e-15);   // (0,0.48)+(0,-0.48) = 0
+    EXPECT_NEAR(Mi_y, 0.96, 1e-12);  // imag((0,0.48)-(0,-0.48)) = imag(0,0.96) = 0.96
+}
+
+TEST_F(DeltaSpinPwTest, MiPw_MultiAtom_BeginIhOffset)
+{
+    // Two atoms with different nh, verify begin_ih offset
+    const int nat = 2;
+    const int nh_0 = 2, nh_1 = 1;
+    const int nkb = nh_0 + nh_1; // 3
+    const int nbands = 1;
+    const double weight = 1.0;
+    const int sign = 1;
+
+    std::vector<std::complex<double>> becp(nbands * nkb, {0.0, 0.0});
+    // atom 0: ih=0,1
+    becp[0] = {1.0, 0.0}; // |becp|^2 = 1.0
+    becp[1] = {0.0, 1.0}; // |becp|^2 = 1.0
+    // atom 1: ih=2
+    becp[2] = {0.5, 0.5}; // |becp|^2 = 0.5
+
+    int nh_iat[2] = {nh_0, nh_1};
+    double Mi_z[2] = {0.0, 0.0};
+
+    for(int ib = 0; ib < nbands; ib++)
+    {
+        int begin_ih = 0;
+        for(int iat = 0; iat < nat; iat++)
+        {
+            double occ = 0.0;
+            for(int ih = 0; ih < nh_iat[iat]; ih++)
+            {
+                const int index = ib * nkb + begin_ih + ih;
+                occ += (std::conj(becp[index]) * becp[index]).real();
+            }
+            Mi_z[iat] += sign * weight * occ;
+            begin_ih += nh_iat[iat];
+        }
+    }
+
+    EXPECT_NEAR(Mi_z[0], 2.0, 1e-12); // 1.0 + 1.0
+    EXPECT_NEAR(Mi_z[1], 0.5, 1e-12); // 0.5
+}
+
+// =====================================================================
+// cal_mw_from_lambda: magnetization re-accumulation from becp_tmp
+// =====================================================================
+
+TEST_F(DeltaSpinPwTest, MwFromLambda_Npol2_Accumulation)
+{
+    // Same formula as cal_Mi_pw npol=2, but from becp_tmp
+    const int nkb = 1;
+    const int nbands = 1;
+    const int npol = 2;
+    const int nk = 2;
+    const double weights[2] = {1.0, 0.5};
+
+    const int size_becp = nbands * nkb * npol;
+    std::vector<std::complex<double>> becp_tmp(size_becp * nk, {0.0, 0.0});
+    // k=0
+    becp_tmp[0] = {0.8, 0.0};       // becp_up
+    becp_tmp[0 + nkb] = {0.0, 0.6}; // becp_dn
+    // k=1
+    becp_tmp[size_becp + 0] = {0.6, 0.0};
+    becp_tmp[size_becp + 0 + nkb] = {0.0, 0.8};
+
+    double Mi_x = 0.0, Mi_y = 0.0, Mi_z = 0.0;
+    int nh_iat[1] = {1};
+
+    for(int ik = 0; ik < nk; ik++)
+    {
+        const std::complex<double>* becp = &becp_tmp[ik * size_becp];
+        for(int ib = 0; ib < nbands; ib++)
+        {
+            const double weight = weights[ik];
+            int begin_ih = 0;
+            for(int iat = 0; iat < 1; iat++)
+            {
+                std::complex<double> occ[4] = {{0,0},{0,0},{0,0},{0,0}};
+                for(int ih = 0; ih < nh_iat[iat]; ih++)
+                {
+                    const int index = ib * npol * nkb + begin_ih + ih;
+                    occ[0] += std::conj(becp[index]) * becp[index];
+                    occ[1] += std::conj(becp[index]) * becp[index + nkb];
+                    occ[2] += std::conj(becp[index + nkb]) * becp[index];
+                    occ[3] += std::conj(becp[index + nkb]) * becp[index + nkb];
+                }
+                Mi_x += weight * (occ[1] + occ[2]).real();
+                Mi_y += weight * (occ[1] - occ[2]).imag();
+                Mi_z += weight * (occ[0] - occ[3]).real();
+                begin_ih += nh_iat[iat];
+            }
+        }
+    }
+
+    // k=0, w=1.0: occ0=0.64, occ3=0.36 => dz=0.28, occ1=(0,0.48), occ2=(0,-0.48) => dx=0, dy=0.96
+    // k=1, w=0.5: occ0=0.36, occ3=0.64 => dz=-0.28*0.5=-0.14, occ1=(0,0.48), occ2=(0,-0.48) => dy=0.96*0.5=0.48
+    EXPECT_NEAR(Mi_z, 0.14, 1e-12);  // 0.28 - 0.14
+    EXPECT_NEAR(Mi_x, 0.0, 1e-15);
+    EXPECT_NEAR(Mi_y, 1.44, 1e-12);  // 0.96 + 0.48
+}
+
+TEST_F(DeltaSpinPwTest, MwFromLambda_Npol1_SignHandling)
+{
+    // npol=1: isk[ik]=0 => sign=+1, isk[ik]=1 => sign=-1
+    const int nkb = 1;
+    const int nbands = 1;
+    const int nk = 2;
+    const double weight = 1.0;
+    const int isk[2] = {0, 1}; // first k spin-up, second k spin-down
+
+    std::vector<std::complex<double>> becp_tmp(nbands * nkb * nk, {0.0, 0.0});
+    becp_tmp[0] = {0.5, 0.0}; // k=0: |becp|^2 = 0.25
+    becp_tmp[1] = {0.5, 0.0}; // k=1: |becp|^2 = 0.25
+
+    double Mi_z = 0.0;
+    for(int ik = 0; ik < nk; ik++)
+    {
+        const int sign = (isk[ik] == 0) ? 1 : -1;
+        const std::complex<double>* becp = &becp_tmp[ik * nbands * nkb];
+        for(int ib = 0; ib < nbands; ib++)
+        {
+            double occ = 0.0;
+            for(int ih = 0; ih < nkb; ih++)
+            {
+                const int index = ib * nkb + ih;
+                occ += (std::conj(becp[index]) * becp[index]).real();
+            }
+            Mi_z += weight * occ * sign;
+        }
+    }
+
+    // k=0: +1 * 1.0 * 0.25 = 0.25
+    // k=1: -1 * 1.0 * 0.25 = -0.25
+    EXPECT_NEAR(Mi_z, 0.0, 1e-15);
+}
+
+// =====================================================================
+// DeltaHcc gemm contribution: h_tmp += becp^H * ps
+// =====================================================================
+
+TEST_F(DeltaSpinPwTest, DeltaHcc_GemmContribution)
+{
+    // Verify h_tmp += becp^H * ps for a small 2x2 case
+    // becp: (npm x nbands), ps: (npm x nbands)
+    // h_tmp += becp^H * ps = (nbands x npm) * (npm x nbands)
+    const int nbands = 2;
+    const int npm = 2; // nkb * npol
+
+    // becp^H means conjugate transpose
+    std::vector<std::complex<double>> becp = {
+        {1.0, 0.0}, {0.0, 1.0},  // column 0: becp[0,0], becp[1,0]
+        {0.5, 0.0}, {0.0, -0.5}  // column 1: becp[0,1], becp[1,1]
+    };
+    std::vector<std::complex<double>> ps = {
+        {0.5, 0.0}, {0.0, 0.5},
+        {0.3, 0.0}, {0.0, -0.3}
+    };
+
+    // Manual: h_tmp[i,j] += sum_k conj(becp[k,i]) * ps[k,j]
+    // becp stored column-major as becp[k + i*npm], ps stored as ps[k + j*npm]
+    std::vector<std::complex<double>> h_tmp(nbands * nbands, {0.0, 0.0});
+    for(int i = 0; i < nbands; i++)
+    {
+        for(int j = 0; j < nbands; j++)
+        {
+            for(int k = 0; k < npm; k++)
+            {
+                h_tmp[i * nbands + j] += std::conj(becp[k + i * npm]) * ps[k + j * npm];
+            }
+        }
+    }
+
+    // h[0,0] = conj(1)*0.5 + conj(0,1)*(0,0.5) = 0.5 + (0,-1)*(0,0.5) = 0.5 + 0.5 = 1.0
+    EXPECT_NEAR(h_tmp[0].real(), 1.0, 1e-12);
+    EXPECT_NEAR(h_tmp[0].imag(), 0.0, 1e-12);
+
+    // h[0,1] = conj(1)*0.3 + conj(0,1)*(0,-0.3) = 0.3 + (0,-1)*(0,-0.3) = 0.3 + (-0.3) = 0
+    EXPECT_NEAR(h_tmp[1].real(), 0.0, 1e-12);
+    EXPECT_NEAR(h_tmp[1].imag(), 0.0, 1e-12);
+
+    // h[1,0] = conj(0.5)*0.5 + conj(0,-0.5)*(0,0.5) = 0.25 + (0,0.5)*(0,0.5) = 0.25 + (-0.25) = 0
+    EXPECT_NEAR(h_tmp[2].real(), 0.0, 1e-12);
+    EXPECT_NEAR(h_tmp[2].imag(), 0.0, 1e-12);
+
+    // h[1,1] = conj(0.5)*0.3 + conj(0,-0.5)*(0,-0.3) = 0.15 + (0,0.5)*(0,-0.3) = 0.15 + 0.15 = 0.3
+    EXPECT_NEAR(h_tmp[3].real(), 0.3, 1e-12);
+    EXPECT_NEAR(h_tmp[3].imag(), 0.0, 1e-12);
+}
diff --git a/source/source_lcao/module_deltaspin/test/lambda_update_strategies_test.cpp b/source/source_lcao/module_deltaspin/test/lambda_update_strategies_test.cpp
new file mode 100644
index 00000000000..b196bfe030c
--- /dev/null
+++ b/source/source_lcao/module_deltaspin/test/lambda_update_strategies_test.cpp
@@ -0,0 +1,479 @@
+#include "../lambda_update_strategies.h"
+#include "gtest/gtest.h"
+#include "gmock/gmock.h"
+#include <cmath>
+#include <vector>
+#include <string>
+
+/************************************************
+ *  Unit tests for lambda update strategies
+ *
+ *  - Tested Strategies:
+ *    - LinearResponseUpdate (Scheme B)
+ *    - AugmentedLagrangianUpdate (Scheme C)
+ *    - HybridDelayedUpdate (Scheme D)
+ *
+ *  - Tested Helpers:
+ *    - compute_rms_error()
+ *    - count_converged()
+ *    - cap_lambda()
+ ************************************************/
+
+namespace
+{
+
+using ModuleBase::Vector3;
+
+// ===================================================================
+// Helper function tests
+// ===================================================================
+
+class LambdaUpdateHelpersTest : public ::testing::Test
+{
+  protected:
+    int nat;
+    std::vector<Vector3<double>> Mi;
+    std::vector<Vector3<double>> target_mag;
+    std::vector<Vector3<int>> constrain;
+
+    void SetUp() override
+    {
+        nat = 3;
+        Mi.push_back(Vector3<double>(1.0, 0.5, 0.3));
+        Mi.push_back(Vector3<double>(-0.8, 0.2, 0.1));
+        Mi.push_back(Vector3<double>(0.5, 0.5, 0.5));
+
+        target_mag.push_back(Vector3<double>(2.0, 0.0, 0.0));
+        target_mag.push_back(Vector3<double>(-1.0, 0.0, 0.0));
+        target_mag.push_back(Vector3<double>(0.5, 0.5, 0.5));
+
+        constrain.push_back(Vector3<int>(1, 1, 0));
+        constrain.push_back(Vector3<int>(1, 0, 0));
+        constrain.push_back(Vector3<int>(1, 1, 1));
+    }
+};
+
+TEST_F(LambdaUpdateHelpersTest, ComputeRmsError)
+{
+    double rms = spinconstrain::compute_rms_error(Mi, target_mag, constrain, nat);
+    // Constrained: atom0(x,y), atom1(x), atom2(x,y,z) = 6 components
+    double expected_sum = 1.0*1.0 + 0.5*0.5 + 0.2*0.2 + 0.0 + 0.0 + 0.0;
+    double expected_rms = std::sqrt(expected_sum / 6.0);
+    EXPECT_NEAR(rms, expected_rms, 1e-10);
+}
+
+TEST_F(LambdaUpdateHelpersTest, ComputeRmsErrorAlreadyConverged)
+{
+    Mi[0] = target_mag[0];
+    Mi[1] = target_mag[1];
+    Mi[2] = target_mag[2];
+    double rms = spinconstrain::compute_rms_error(Mi, target_mag, constrain, nat);
+    EXPECT_NEAR(rms, 0.0, 1e-15);
+}
+
+TEST_F(LambdaUpdateHelpersTest, ComputeRmsErrorNoConstraints)
+{
+    std::vector<Vector3<int>> no_constrain(nat, Vector3<int>(0, 0, 0));
+    double rms = spinconstrain::compute_rms_error(Mi, target_mag, no_constrain, nat);
+    EXPECT_NEAR(rms, 0.0, 1e-15);
+}
+
+TEST_F(LambdaUpdateHelpersTest, CountConverged)
+{
+    int n = spinconstrain::count_converged(Mi, target_mag, constrain, 0.3, nat);
+    EXPECT_EQ(n, 4); // 1 from atom1 + 3 from atom2
+}
+
+TEST_F(LambdaUpdateHelpersTest, CountConvergedAll)
+{
+    Mi[0] = target_mag[0];
+    Mi[1] = target_mag[1];
+    Mi[2] = target_mag[2];
+    int n = spinconstrain::count_converged(Mi, target_mag, constrain, 1e-6, nat);
+    EXPECT_EQ(n, 6);
+}
+
+TEST_F(LambdaUpdateHelpersTest, CapLambda)
+{
+    std::vector<Vector3<double>> lam(nat);
+    lam[0] = Vector3<double>(15.0, -20.0, 5.0);
+    lam[1] = Vector3<double>(0.0, 8.0, -12.0);
+    lam[2] = Vector3<double>(3.0, 3.0, 3.0);
+
+    std::vector<Vector3<int>> con(nat);
+    con[0] = Vector3<int>(1, 1, 1);
+    con[1] = Vector3<int>(0, 1, 0);
+    con[2] = Vector3<int>(1, 1, 1);
+
+    spinconstrain::cap_lambda(lam, con, 10.0, nat);
+
+    EXPECT_NEAR(lam[0][0], 10.0, 1e-10);
+    EXPECT_NEAR(lam[0][1], -10.0, 1e-10);
+    EXPECT_NEAR(lam[0][2], 5.0, 1e-10);
+    EXPECT_NEAR(lam[1][0], 0.0, 1e-10);
+    EXPECT_NEAR(lam[1][1], 8.0, 1e-10);
+    EXPECT_NEAR(lam[1][2], -12.0, 1e-10);
+    EXPECT_NEAR(lam[2][0], 3.0, 1e-10);
+    EXPECT_NEAR(lam[2][1], 3.0, 1e-10);
+    EXPECT_NEAR(lam[2][2], 3.0, 1e-10);
+}
+
+// ===================================================================
+// Scheme B: Linear Response Update tests
+// ===================================================================
+
+class LinearResponseTest : public ::testing::Test
+{
+  protected:
+    int nat;
+    std::vector<Vector3<double>> lambda;
+    std::vector<Vector3<double>> Mi;
+    std::vector<Vector3<double>> target_mag;
+    std::vector<Vector3<int>> constrain;
+
+    void SetUp() override
+    {
+        nat = 2;
+        lambda.push_back(Vector3<double>(0.0, 0.0, 0.0));
+        lambda.push_back(Vector3<double>(0.0, 0.0, 0.0));
+        Mi.push_back(Vector3<double>(1.0, 0.0, 0.0));
+        Mi.push_back(Vector3<double>(-0.5, 0.0, 0.0));
+        target_mag.push_back(Vector3<double>(2.0, 0.0, 0.0));
+        target_mag.push_back(Vector3<double>(-1.0, 0.0, 0.0));
+        constrain.push_back(Vector3<int>(1, 1, 1));
+        constrain.push_back(Vector3<int>(1, 1, 1));
+    }
+};
+
+TEST_F(LinearResponseTest, FirstUpdateNoHistory)
+{
+    spinconstrain::LinearResponseUpdate updater(0.01, 100.0, 0.3, 10.0);
+    EXPECT_EQ(updater.name(), "LinearResponse");
+    EXPECT_FALSE(updater.is_converged());
+
+    auto result = updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-6, 0, nat);
+
+    EXPECT_NEAR(lambda[0][0], 0.3, 1e-10);
+    EXPECT_NEAR(lambda[0][1], 0.0, 1e-10);
+    EXPECT_LT(result.max_lambda, 1.0);
+    EXPECT_EQ(result.status, "updating");
+}
+
+TEST_F(LinearResponseTest, ConvergesAfterMultipleSteps)
+{
+    spinconstrain::LinearResponseUpdate updater(0.01, 100.0, 0.5, 10.0);
+    double chi = 1.0;
+    Vector3<double> Mi_init_0 = Mi[0];
+    Vector3<double> Mi_init_1 = Mi[1];
+
+    int max_iter = 50;
+    int converged_iter = -1;
+    for (int iter = 0; iter < max_iter; ++iter)
+    {
+        auto result = updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-5, iter, nat);
+        Mi[0] = Vector3<double>(Mi_init_0.x + chi * lambda[0][0],
+                                Mi_init_0.y + chi * lambda[0][1],
+                                Mi_init_0.z + chi * lambda[0][2]);
+        Mi[1] = Vector3<double>(Mi_init_1.x + chi * lambda[1][0],
+                                Mi_init_1.y + chi * lambda[1][1],
+                                Mi_init_1.z + chi * lambda[1][2]);
+        if (updater.is_converged())
+        {
+            EXPECT_LT(result.rms_error, 1e-5);
+            converged_iter = iter;
+            break;
+        }
+    }
+    EXPECT_GE(converged_iter, 0) << "Linear response did not converge within " << max_iter;
+
+    double expected_l0 = (target_mag[0][0] - Mi_init_0.x) / chi;
+    double expected_l1 = (target_mag[1][0] - Mi_init_1.x) / chi;
+    EXPECT_NEAR(lambda[0][0], expected_l0, 0.1);
+    EXPECT_NEAR(lambda[1][0], expected_l1, 0.1);
+}
+
+TEST_F(LinearResponseTest, RespectsConstrainFlags)
+{
+    std::vector<Vector3<int>> partial_constrain(nat);
+    partial_constrain[0] = Vector3<int>(1, 0, 0);
+    partial_constrain[1] = Vector3<int>(0, 0, 0);
+
+    spinconstrain::LinearResponseUpdate updater(0.01, 100.0, 0.3, 10.0);
+    updater.update_lambda(lambda, Mi, target_mag, partial_constrain, 1e-6, 0, nat);
+
+    EXPECT_NEAR(lambda[0][0], 0.3, 1e-10);
+    EXPECT_NEAR(lambda[0][1], 0.0, 1e-10);
+    EXPECT_NEAR(lambda[1][0], 0.0, 1e-10);
+}
+
+TEST_F(LinearResponseTest, CapsLambda)
+{
+    target_mag[0] = Vector3<double>(100.0, 0.0, 0.0);
+    spinconstrain::LinearResponseUpdate updater(0.01, 100.0, 1.0, 5.0);
+    updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-6, 0, nat);
+    EXPECT_LE(std::abs(lambda[0][0]), 5.0 + 1e-10);
+}
+
+TEST_F(LinearResponseTest, ChiEstimation)
+{
+    spinconstrain::LinearResponseUpdate updater(0.01, 100.0, 0.5, 10.0);
+    double chi_true = 2.0;
+    Vector3<double> Mi_init = Mi[0];
+
+    for (int iter = 0; iter < 5; ++iter)
+    {
+        updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-6, iter, nat);
+        Mi[0] = Vector3<double>(Mi_init.x + chi_true * lambda[0][0], 0.0, 0.0);
+        Mi[1] = Vector3<double>(-0.5, 0.0, 0.0);
+    }
+
+    const auto& chi = updater.get_chi();
+    EXPECT_GT(chi[0][0], 0.5);
+    EXPECT_LT(chi[0][0], 50.0);
+}
+
+// ===================================================================
+// Scheme C: Augmented Lagrangian Update tests
+// ===================================================================
+
+class AugmentedLagrangianTest : public ::testing::Test
+{
+  protected:
+    int nat;
+    std::vector<Vector3<double>> lambda;
+    std::vector<Vector3<double>> Mi;
+    std::vector<Vector3<double>> target_mag;
+    std::vector<Vector3<int>> constrain;
+
+    void SetUp() override
+    {
+        nat = 2;
+        lambda.push_back(Vector3<double>(0.0, 0.0, 0.0));
+        lambda.push_back(Vector3<double>(0.0, 0.0, 0.0));
+        Mi.push_back(Vector3<double>(1.0, 0.0, 0.0));
+        Mi.push_back(Vector3<double>(-0.5, 0.0, 0.0));
+        target_mag.push_back(Vector3<double>(2.0, 0.0, 0.0));
+        target_mag.push_back(Vector3<double>(-1.0, 0.0, 0.0));
+        constrain.push_back(Vector3<int>(1, 0, 0));
+        constrain.push_back(Vector3<int>(1, 0, 0));
+    }
+};
+
+TEST_F(AugmentedLagrangianTest, FirstUpdate)
+{
+    spinconstrain::AugmentedLagrangianUpdate updater(0.1, 10.0, 1.5, 5, 10.0);
+    EXPECT_EQ(updater.name(), "AugmentedLagrangian");
+
+    auto result = updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-6, 0, nat);
+
+    EXPECT_NEAR(lambda[0][0], -0.1, 1e-10);
+    EXPECT_NEAR(lambda[0][1], 0.0, 1e-10);
+    EXPECT_NEAR(lambda[1][0], 0.05, 1e-10);
+    EXPECT_NEAR(updater.get_mu(), 0.1, 1e-10);
+    EXPECT_FALSE(updater.is_converged());
+}
+
+TEST_F(AugmentedLagrangianTest, MuGrowth)
+{
+    spinconstrain::AugmentedLagrangianUpdate updater(0.1, 10.0, 2.0, 3, 10.0);
+    for (int iter = 0; iter < 10; ++iter)
+    {
+        updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-6, iter, nat);
+    }
+    EXPECT_NEAR(updater.get_mu(), 0.8, 1e-10);
+}
+
+TEST_F(AugmentedLagrangianTest, MuCappedAtMax)
+{
+    spinconstrain::AugmentedLagrangianUpdate updater(0.1, 1.0, 2.0, 1, 10.0);
+    for (int iter = 0; iter < 10; ++iter)
+    {
+        updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-6, iter, nat);
+    }
+    EXPECT_NEAR(updater.get_mu(), 1.0, 1e-10);
+}
+
+TEST_F(AugmentedLagrangianTest, ConvergesWithInvertedResponse)
+{
+    // Inverted response model: Mi = M_target - chi * lambda
+    // Increasing lambda REDUCES the error — models constraint physics correctly
+    spinconstrain::AugmentedLagrangianUpdate updater(0.1, 10.0, 1.5, 5, 10.0);
+    double chi = 1.0;
+
+    int max_iter = 100;
+    int converged_iter = -1;
+    for (int iter = 0; iter < max_iter; ++iter)
+    {
+        auto result = updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-3, iter, nat);
+
+        // Inverted response: Mi approaches M_target as lambda → 0
+        Mi[0] = Vector3<double>(target_mag[0][0] - chi * lambda[0][0], 0.0, 0.0);
+        Mi[1] = Vector3<double>(target_mag[1][0] - chi * lambda[1][0], 0.0, 0.0);
+
+        if (updater.is_converged())
+        {
+            EXPECT_LT(result.rms_error, 1e-3);
+            converged_iter = iter;
+            break;
+        }
+    }
+
+    EXPECT_GE(converged_iter, 0) << "AL did not converge within " << max_iter;
+    EXPECT_NEAR(lambda[0][0], 0.0, 0.5);
+}
+
+TEST_F(AugmentedLagrangianTest, ResetMu)
+{
+    spinconstrain::AugmentedLagrangianUpdate updater(0.1, 10.0, 2.0, 1, 10.0);
+    for (int iter = 0; iter < 5; ++iter)
+    {
+        updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-6, iter, nat);
+    }
+    EXPECT_GT(updater.get_mu(), 0.1);
+    updater.reset_mu();
+    EXPECT_NEAR(updater.get_mu(), 0.1, 1e-10);
+}
+
+// ===================================================================
+// Scheme D: Hybrid Delayed Update tests
+// ===================================================================
+
+class HybridDelayedTest : public ::testing::Test
+{
+  protected:
+    int nat;
+    std::vector<Vector3<double>> lambda;
+    std::vector<Vector3<double>> Mi;
+    std::vector<Vector3<double>> target_mag;
+    std::vector<Vector3<int>> constrain;
+
+    void SetUp() override
+    {
+        nat = 2;
+        lambda.push_back(Vector3<double>(0.0, 0.0, 0.0));
+        lambda.push_back(Vector3<double>(0.0, 0.0, 0.0));
+        Mi.push_back(Vector3<double>(1.0, 0.0, 0.0));
+        Mi.push_back(Vector3<double>(-0.5, 0.0, 0.0));
+        target_mag.push_back(Vector3<double>(2.0, 0.0, 0.0));
+        target_mag.push_back(Vector3<double>(-1.0, 0.0, 0.0));
+        constrain.push_back(Vector3<int>(1, 1, 1));
+        constrain.push_back(Vector3<int>(1, 1, 1));
+    }
+};
+
+TEST_F(HybridDelayedTest, EarlyPhaseSkip)
+{
+    spinconstrain::HybridDelayedUpdate updater(1e-3, 0.1, 10.0, 1.5, 5, 10, 10.0);
+    updater.set_drho(1.0);
+
+    auto result = updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-6, 0, nat);
+    EXPECT_EQ(result.status, "skipped_early");
+    EXPECT_EQ(updater.get_phase(), "early");
+    EXPECT_NEAR(lambda[0][0], 0.0, 1e-10);
+}
+
+TEST_F(HybridDelayedTest, MidPhaseUpdate)
+{
+    spinconstrain::HybridDelayedUpdate updater(1e-3, 0.1, 10.0, 1.5, 5, 10, 10.0);
+    updater.set_drho(5e-3);
+
+    auto result = updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-6, 0, nat);
+    EXPECT_EQ(updater.get_phase(), "mid");
+    EXPECT_NEAR(lambda[0][0], -0.1, 1e-10);
+}
+
+TEST_F(HybridDelayedTest, LatePhaseUpdate)
+{
+    spinconstrain::HybridDelayedUpdate updater(1e-3, 0.1, 10.0, 1.5, 5, 10, 10.0);
+    updater.set_drho(1e-5);
+
+    auto result = updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-6, 0, nat);
+    EXPECT_EQ(updater.get_phase(), "late");
+    EXPECT_NEAR(lambda[0][0], -0.1, 1e-10);
+}
+
+TEST_F(HybridDelayedTest, FallbackSignal)
+{
+    spinconstrain::HybridDelayedUpdate updater(1e-3, 0.1, 10.0, 1.5, 5, 10, 10.0);
+    updater.set_drho(1e-5);
+
+    for (int iter = 0; iter < 5; ++iter)
+    {
+        auto result = updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-6, iter, nat);
+        if (iter >= 2 && result.status == "fallback_triggered")
+        {
+            EXPECT_TRUE(true);
+            return;
+        }
+    }
+    FAIL() << "Fallback was not signaled after several iterations";
+}
+
+TEST_F(HybridDelayedTest, Reset)
+{
+    spinconstrain::HybridDelayedUpdate updater(1e-3, 0.1, 10.0, 1.5, 5, 10, 10.0);
+    updater.set_drho(1e-5);
+    for (int iter = 0; iter < 10; ++iter)
+    {
+        updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-6, iter, nat);
+    }
+    updater.reset();
+    EXPECT_EQ(updater.get_phase(), "early");
+}
+
+TEST_F(HybridDelayedTest, PhaseTransitions)
+{
+    spinconstrain::HybridDelayedUpdate updater(1e-3, 0.1, 10.0, 1.5, 5, 10, 10.0);
+
+    updater.set_drho(1.0);
+    auto r1 = updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-6, 0, nat);
+    EXPECT_EQ(updater.get_phase(), "early");
+    EXPECT_EQ(r1.status, "skipped_early");
+
+    updater.set_drho(5e-3);
+    updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-6, 1, nat);
+    EXPECT_EQ(updater.get_phase(), "mid");
+
+    updater.set_drho(1e-5);
+    updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-6, 2, nat);
+    EXPECT_EQ(updater.get_phase(), "late");
+}
+
+TEST_F(HybridDelayedTest, ConvergesWithInvertedResponse)
+{
+    spinconstrain::HybridDelayedUpdate updater(1e-3, 0.1, 10.0, 1.5, 5, 10, 10.0);
+    updater.set_drho(1e-5);
+    double chi = 1.0;
+
+    int max_iter = 100;
+    int converged_iter = -1;
+    for (int iter = 0; iter < max_iter; ++iter)
+    {
+        auto result = updater.update_lambda(lambda, Mi, target_mag, constrain, 1e-3, iter, nat);
+
+        Mi[0] = Vector3<double>(target_mag[0][0] - chi * lambda[0][0],
+                                target_mag[0][1] - chi * lambda[0][1],
+                                target_mag[0][2] - chi * lambda[0][2]);
+        Mi[1] = Vector3<double>(target_mag[1][0] - chi * lambda[1][0],
+                                target_mag[1][1] - chi * lambda[1][1],
+                                target_mag[1][2] - chi * lambda[1][2]);
+
+        if (updater.is_converged())
+        {
+            EXPECT_LT(result.rms_error, 1e-3);
+            converged_iter = iter;
+            break;
+        }
+    }
+
+    EXPECT_GE(converged_iter, 0) << "Hybrid did not converge within " << max_iter
+                                  << ". Final phase: " << updater.get_phase();
+}
+
+} // namespace
+
+int main(int argc, char** argv)
+{
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/source/source_lcao/module_dftu/CMakeLists.txt b/source/source_lcao/module_dftu/CMakeLists.txt
index 42a58af7ba6..f41322b665c 100644
--- a/source/source_lcao/module_dftu/CMakeLists.txt
+++ b/source/source_lcao/module_dftu/CMakeLists.txt
@@ -19,3 +19,7 @@ add_library(
 if(ENABLE_COVERAGE)
   add_coverage(dftu)
 endif()
+
+if(BUILD_TESTING)
+  add_subdirectory(test)
+endif()
diff --git a/source/source_lcao/module_dftu/dftu.cpp b/source/source_lcao/module_dftu/dftu.cpp
index 2680aed37a6..f3f306ad61e 100644
--- a/source/source_lcao/module_dftu/dftu.cpp
+++ b/source/source_lcao/module_dftu/dftu.cpp
@@ -33,6 +33,7 @@ double Plus_U::uramping = 0.0; // increase U by uramping, default is -1.0
 int Plus_U::omc=0; // occupation matrix control
 
 int Plus_U::mixing_dftu=0; //whether to mix locale
+int Plus_U::nspin=0;
 
 bool Plus_U::Yukawa=false; // whether to use Yukawa potential
 
@@ -73,6 +74,7 @@ void Plus_U::init(UnitCell& cell, // unitcell class
     const int npol = PARAM.globalv.npol;     // number of polarization directions
     const int nlocal = PARAM.globalv.nlocal; // number of total local orbitals
     const int nspin = PARAM.inp.nspin;   // number of spins
+    Plus_U::nspin = nspin;
 
     // mohan update 2025-11-06
     Plus_U::energy_u = 0.0;
@@ -89,6 +91,10 @@ void Plus_U::init(UnitCell& cell, // unitcell class
     // it:index of type of atom
     for (int it = 0; it < cell.ntype; ++it)
     {
+        if(!has_correlated_orbital(it))
+        {
+            continue;
+        }
         for (int ia = 0; ia < cell.atoms[it].na; ia++)
         {
             // ia:index of atoms of this type
@@ -98,9 +104,28 @@ void Plus_U::init(UnitCell& cell, // unitcell class
             locale[iat].resize(cell.atoms[it].nwl + 1);
             locale_save[iat].resize(cell.atoms[it].nwl + 1);
 
-            const int tlp1_npol = (this->orbital_corr[it]*2+1)*npol;
-            this->eff_pot_pw_index[iat] = pot_index;
-            pot_index += tlp1_npol * tlp1_npol;
+            const int tlp1_npol = (get_orbital_corr(it)*2+1)*npol;
+            const int tlp1 = 2 * get_orbital_corr(it) + 1;
+            const int elem_size = tlp1 * tlp1;
+    // eff_pot_pw_index: per-atom offset into eff_pot_pw (and uom_array)
+    //
+    // nspin=1: offset = sum(tlp1^2 for preceding atoms), total = sum(all tlp1^2)
+    // nspin=2: same per-spin-channel offset; after the loop, pot_index *= 2
+    //          to create split layout: [all_spin_up | all_spin_down]
+    //          spin-up  at eff_pot_pw[eff_pot_pw_index[iat] + mm]
+    //          spin-down at eff_pot_pw[size/2 + eff_pot_pw_index[iat] + mm]
+    // nspin=4: offset = sum(tlp1_npol^2) where tlp1_npol = (2l+1)*npol = 2*(2l+1)
+    //          each atom occupies (2*tlp1)^2 = 4*tlp1^2 entries for 4 Pauli blocks
+    if(nspin == 4)
+    {
+        this->eff_pot_pw_index[iat] = pot_index;
+        pot_index += tlp1_npol * tlp1_npol;
+    }
+    else // nspin=1 or nspin=2: one tlp1^2 block per atom per spin channel
+    {
+        this->eff_pot_pw_index[iat] = pot_index;
+        pot_index += elem_size;
+    }
 
             for (int l = 0; l <= cell.atoms[it].nwl; l++)
             {
@@ -166,7 +191,13 @@ void Plus_U::init(UnitCell& cell, // unitcell class
         }
     }
     // allocate memory for eff_pot_pw
+    // nspin=2: split layout [all_spin_up | all_spin_down], double the size
+    // nspin=4: each atom already has 4*tlp1^2 (tlp1_npol^2) entries for Pauli blocks
+    if (nspin == 2) pot_index *= 2;
+
     this->eff_pot_pw.resize(pot_index, 0.0);
+    this->uom_array.resize(pot_index, 0.0);
+    this->uom_save.resize(pot_index, 0.0);
 
     if (Yukawa)
     {
@@ -208,7 +239,7 @@ void Plus_U::init(UnitCell& cell, // unitcell class
         this->local_occup_bcast(cell);
 #endif
 
-        initialed_locale = true;
+        mark_locale_initialized();
         this->copy_locale(cell);
     }
     else
@@ -216,12 +247,12 @@ void Plus_U::init(UnitCell& cell, // unitcell class
         if (PARAM.inp.init_chg == "file")
         {
             std::stringstream sst;
-            sst << PARAM.globalv.global_out_dir << "onsite.dm";
+            sst << PARAM.globalv.global_readin_dir << "onsite.dm";
             this->read_occup_m(cell,sst.str());
 #ifdef __MPI
             this->local_occup_bcast(cell);
 #endif
-            initialed_locale = true;
+            mark_locale_initialized();
         }
         else
         {
@@ -240,7 +271,7 @@ void Plus_U::cal_energy_correction(const UnitCell& ucell,
 {
     ModuleBase::TITLE("Plus_U", "cal_energy_correction");
     ModuleBase::timer::start("Plus_U", "cal_energy_correction");
-    if (!initialed_locale)
+    if (!is_locale_initialized())
     {
         ModuleBase::timer::end("Plus_U", "cal_energy_correction");
         return;
@@ -254,7 +285,7 @@ void Plus_U::cal_energy_correction(const UnitCell& ucell,
     for (int T = 0; T < ucell.ntype; T++)
     {
         const int NL = ucell.atoms[T].nwl + 1;
-        const int LC = orbital_corr[T];
+        const int LC = get_orbital_corr(T);
         for (int I = 0; I < ucell.atoms[T].na; I++)
         {
             if (LC == -1)
@@ -263,11 +294,11 @@ void Plus_U::cal_energy_correction(const UnitCell& ucell,
             }
 
             const int iat = ucell.itia2iat(T, I);
-            const int L = orbital_corr[T];
+            const int L = get_orbital_corr(T);
 
             for (int l = 0; l < NL; l++)
             {
-                if (l != orbital_corr[T])
+                if (l != get_orbital_corr(T))
                 {
                     continue;
                 }
diff --git a/source/source_lcao/module_dftu/dftu.h b/source/source_lcao/module_dftu/dftu.h
index 9fa468aca5c..5d8a82851d1 100644
--- a/source/source_lcao/module_dftu/dftu.h
+++ b/source/source_lcao/module_dftu/dftu.h
@@ -4,6 +4,7 @@
 #include "source_cell/klist.h"
 #include "source_cell/unitcell.h"
 #include "source_basis/module_ao/parallel_orbitals.h"
+#include "source_estate/module_charge/charge_mixing.h"
 #ifdef __LCAO
 #include "source_hamilt/hamilt.h"
 #include "source_lcao/module_hcontainer/hcontainer.h"
@@ -61,6 +62,27 @@ class Plus_U
     static double uramping; // increase U by uramping, default is -1.0
     static int omc; // occupation matrix control
     static int mixing_dftu; //whether to mix locale
+    static int nspin;       // spin channel count (1, 2, or 4), set during init
+
+    // --- Accessors for static data (prefer these over direct member access) ---
+
+    /// get Hubbard U for atom type it
+    static double get_hubbard_u(int it) { return U[it]; }
+
+    /// get target Hubbard U0 for atom type it
+    static double get_hubbard_u0(int it) { return U0[it]; }
+
+    /// number of atom types with Hubbard U parameters
+    static int get_num_u_types() { return static_cast<int>(U.size()); }
+
+    /// get correlated orbital angular momentum for atom type it (-1 = none)
+    static int get_orbital_corr(int it) { return orbital_corr[it]; }
+
+    /// whether atom type it has a correlated orbital
+    static bool has_correlated_orbital(int it) { return orbital_corr[it] != -1; }
+
+    /// raw data pointer to orbital_corr (for kernel interfaces)
+    static const int* get_orbital_corr_data() { return orbital_corr.data(); }
 
   private:
 
@@ -113,25 +135,54 @@ class Plus_U
   public:
     /// interface for PW base
 	/// calculate the local occupation number matrix for PW based wave functions
-	void cal_occ_pw(const int iter, 
-			const void* psi_in, 
-			const ModuleBase::matrix& wg_in, 
-			const UnitCell& cell, 
-			const double& mixing_beta);
+	void cal_occ_pw(const int iter,
+			const void* psi_in,
+			const ModuleBase::matrix& wg_in,
+			const UnitCell& cell,
+			Charge_Mixing* p_chgmix,
+			const int* isk);
 
     /// calculate the local DFT+U effective potential matrix for PW base.
     void cal_VU_pot_pw(const int spin);
 
-    /// get effective potential matrix for PW base
-	const std::complex<double>* get_eff_pot_pw(const int iat) const 
-	{ 
-		return &(eff_pot_pw[this->eff_pot_pw_index[iat]]); 
-	}
-
-	int get_size_eff_pot_pw() const 
-	{ 
-		return eff_pot_pw.size(); 
-	}
+    /// get effective potential pointer for the given spin channel (PW basis)
+    ///
+    /// nspin=1: isk is ignored, returns &eff_pot_pw[0]
+    /// nspin=2: isk selects spin-up (0) or spin-down (1) half of the
+    ///          split layout [all_up | all_dn]
+    /// nspin=4: isk is ignored, returns &eff_pot_pw[0] (all Pauli blocks)
+    const std::complex<double>* get_eff_pot_pw_spin(const int isk) const
+    {
+        if (nspin == 2 && isk == 1)
+        {
+            return eff_pot_pw.data() + eff_pot_pw.size() / 2;
+        }
+        return eff_pot_pw.data();
+    }
+
+    /// get size of effective potential for a single spin channel (PW basis)
+    ///
+    /// nspin=1: full array size
+    /// nspin=2: half of the total (one spin channel in split layout)
+    /// nspin=4: full array size (all Pauli blocks are packed together)
+    int get_size_eff_pot_pw_spin() const
+    {
+        return (nspin == 2) ? static_cast<int>(eff_pot_pw.size() / 2)
+                            : static_cast<int>(eff_pot_pw.size());
+    }
+
+    /// get effective potential matrix for PW base (per-atom, raw index)
+    /// @deprecated Use get_eff_pot_pw_spin() for nspin-aware access.
+    [[deprecated("Use get_eff_pot_pw_spin() for nspin-aware access")]]
+    const std::complex<double>* get_eff_pot_pw(const int iat) const
+    {
+        return &(eff_pot_pw[this->eff_pot_pw_index[iat]]);
+    }
+
+    int get_size_eff_pot_pw() const
+    {
+        return eff_pot_pw.size();
+    }
 
 #ifdef __LCAO
     // calculate the local occupation number matrix
@@ -152,6 +203,15 @@ class Plus_U
     // dftu can be calculated only after locale has been initialed
     bool initialed_locale = false;
 
+    // --- Accessors for initialed_locale ---
+    bool is_locale_initialized() const { return initialed_locale; }
+    void mark_locale_initialized() { initialed_locale = true; }
+    void mark_locale_dirty() { initialed_locale = false; }
+
+    // --- Accessors for mixing_dftu ---
+    static bool is_mixing_enabled() { return mixing_dftu != 0; }
+    static void enable_mixing() { mixing_dftu = 1; }
+
   private:
 
     void copy_locale(const UnitCell& ucell);
@@ -160,8 +220,36 @@ class Plus_U
 
     std::vector<std::complex<double>> eff_pot_pw;
     std::vector<int> eff_pot_pw_index;
+    std::vector<double> uom_array;
+    std::vector<double> uom_save;
+
+    void set_locale(const UnitCell& ucell);
 
   public:
+    /// get occupation matrix element locale[iat][l][n][spin](m1,m2)
+    double get_locale(const int iat, const int l, const int n, const int spin,
+                     const int m1, const int m2) const
+    {
+        return locale[iat][l][n][spin](m1, m2);
+    }
+
+    /// set occupation matrix element locale[iat][l][n][spin](m1,m2)
+    void set_locale(const int iat, const int l, const int n, const int spin,
+                   const int m1, const int m2, const double val)
+    {
+        locale[iat][l][n][spin](m1, m2) = val;
+    }
+
+    /// get flat occupation matrix for an atom's correlated orbital.
+    /// nspin=1: fills occ with locale[iat][l][0][0] data
+    /// nspin=2: fills occ with interleaved locale[iat][l][0][0] and [1] data
+    /// nspin=4: fills occ with locale[iat][l][0][0] data (all 4 Pauli blocks)
+    void get_locale_flat(const int iat, const int l, std::vector<double>& occ) const;
+
+    /// set flat occupation matrix for an atom's correlated orbital (write-back)
+    void set_locale_flat(const int iat, const int l, const int spin,
+                        const std::vector<double>& occ);
+
 	// local occupancy matrix of the correlated subspace
     // locale: the out put local occupation number matrix of correlated electrons in the current electronic step
     // locale_save: the input local occupation number matrix of correlated electrons in the current electronic step
diff --git a/source/source_lcao/module_dftu/dftu_force.cpp b/source/source_lcao/module_dftu/dftu_force.cpp
index 7bdce056d3c..a2b6ffca4bf 100644
--- a/source/source_lcao/module_dftu/dftu_force.cpp
+++ b/source/source_lcao/module_dftu/dftu_force.cpp
@@ -252,7 +252,7 @@ void Plus_U::cal_force_k(const UnitCell& ucell,
         for (int it = 0; it < ucell.ntype; it++)
         {
             const int NL = ucell.atoms[it].nwl + 1;
-            const int LC = orbital_corr[it];
+            const int LC = get_orbital_corr(it);
 
             if (LC == -1)
                 continue;
@@ -262,7 +262,7 @@ void Plus_U::cal_force_k(const UnitCell& ucell,
 
                 for (int l = 0; l < NL; l++)
                 {
-                    if (l != orbital_corr[it])
+                    if (l != get_orbital_corr(it))
                         continue;
                     const int N = ucell.atoms[it].l_nchi[l];
 
diff --git a/source/source_lcao/module_dftu/dftu_hamilt.cpp b/source/source_lcao/module_dftu/dftu_hamilt.cpp
index bb7f59a69f4..e2c37039960 100644
--- a/source/source_lcao/module_dftu/dftu_hamilt.cpp
+++ b/source/source_lcao/module_dftu/dftu_hamilt.cpp
@@ -11,7 +11,7 @@ void Plus_U::cal_eff_pot_mat_complex(const int ik,
 		const std::complex<double>* sk)
 {
     ModuleBase::TITLE("Plus_U", "cal_eff_pot_c");
-    if (!this->initialed_locale)
+    if (!is_locale_initialized())
     {
         return;
     }
@@ -64,7 +64,7 @@ void Plus_U::cal_eff_pot_mat_complex(const int ik,
 void Plus_U::cal_eff_pot_mat_real(const int ik, double* eff_pot, const std::vector<int>& isk, const double* sk)
 {
     ModuleBase::TITLE("Plus_U", "cal_eff_pot_r");
-    if (!this->initialed_locale)
+    if (!is_locale_initialized())
     {
         return;
     }
diff --git a/source/source_lcao/module_dftu/dftu_io.cpp b/source/source_lcao/module_dftu/dftu_io.cpp
index 737c1c590a3..d44113d1be9 100644
--- a/source/source_lcao/module_dftu/dftu_io.cpp
+++ b/source/source_lcao/module_dftu/dftu_io.cpp
@@ -18,9 +18,9 @@ void Plus_U::output(const UnitCell &ucell)
         {
             const int N = ucell.atoms[T].l_nchi[L];
 
-            if (L >= orbital_corr[T] && orbital_corr[T] != -1)
+            if (L >= get_orbital_corr(T) && has_correlated_orbital(T))
             {
-				if (L != orbital_corr[T]) 
+				if (L != get_orbital_corr(T)) 
 				{
 					continue;
 				}
@@ -86,12 +86,12 @@ void Plus_U::write_occup_m(const UnitCell& ucell,
 
     for (int T = 0; T < ucell.ntype; T++)
     {
-		if (orbital_corr[T] == -1) 
+		if (!has_correlated_orbital(T)) 
 		{
 			continue;
 		}
 		const int NL = ucell.atoms[T].nwl + 1;
-        const int LC = orbital_corr[T];
+        const int LC = get_orbital_corr(T);
 
         for (int I = 0; I < ucell.atoms[T].na; I++)
         {
@@ -101,7 +101,7 @@ void Plus_U::write_occup_m(const UnitCell& ucell,
 
             for (int l = 0; l < NL; l++)
             {
-				if (l != orbital_corr[T]) 
+				if (l != get_orbital_corr(T)) 
 				{
 					continue;
 				}
@@ -290,11 +290,11 @@ void Plus_U::read_occup_m(const UnitCell& ucell,
 
             T = ucell.iat2it[iat];
             const int NL = ucell.atoms[T].nwl + 1;
-            const int LC = orbital_corr[T];
+            const int LC = get_orbital_corr(T);
 
             for (int l = 0; l < NL; l++)
             {
-				if (l != orbital_corr[T]) 
+				if (l != get_orbital_corr(T)) 
 				{
 					continue;
 				}
@@ -410,7 +410,7 @@ void Plus_U::local_occup_bcast(const UnitCell& ucell)
 
     for (int T = 0; T < ucell.ntype; T++)
     {
-		if (orbital_corr[T] == -1) 
+		if (!has_correlated_orbital(T)) 
 		{
 			continue;
 		}
@@ -418,11 +418,11 @@ void Plus_U::local_occup_bcast(const UnitCell& ucell)
         for (int I = 0; I < ucell.atoms[T].na; I++)
         {
             const int iat = ucell.itia2iat(T, I);
-            const int L = orbital_corr[T];
+            const int L = get_orbital_corr(T);
 
             for (int l = 0; l <= ucell.atoms[T].nwl; l++)
             {
-				if (l != orbital_corr[T]) 
+				if (l != get_orbital_corr(T)) 
 				{
 					continue;
 				}
diff --git a/source/source_lcao/module_dftu/dftu_occup.cpp b/source/source_lcao/module_dftu/dftu_occup.cpp
index 1babe0cad18..54890acfbe3 100644
--- a/source/source_lcao/module_dftu/dftu_occup.cpp
+++ b/source/source_lcao/module_dftu/dftu_occup.cpp
@@ -6,6 +6,12 @@
 #endif
 #include "source_base/module_external/scalapack_connector.h"
 
+// copy_locale — save current locale to locale_save and uom_save
+//
+// nspin=1: single spin channel, uom_save[eff_pot_pw_index[iat]+mm]
+// nspin=2: split layout — spin-up at uom_save[index+mm],
+//          spin-down at uom_save[half_size+index+mm]
+// nspin=4: all 4 Pauli blocks packed contiguously from index
 void Plus_U::copy_locale(const UnitCell& ucell)
 {
     ModuleBase::TITLE("Plus_U", "copy_locale");
@@ -13,29 +19,40 @@ void Plus_U::copy_locale(const UnitCell& ucell)
 
     for (int T = 0; T < ucell.ntype; T++)
     {
-		if (orbital_corr[T] == -1) 
-		{
-			continue;
-		}
+        int target_l = get_orbital_corr(T);
+        if (target_l == -1)
+            continue;
 
         for (int I = 0; I < ucell.atoms[T].na; I++)
         {
             const int iat = ucell.itia2iat(T, I);
 
-            for (int l = 0; l < ucell.atoms[T].nwl + 1; l++)
+            if (PARAM.inp.nspin == 4)
             {
-                const int N = ucell.atoms[T].l_nchi[l];
-
-                for (int n = 0; n < N; n++)
+                locale_save[iat][target_l][0][0] = locale[iat][target_l][0][0];
+                // nspin=4 locale matrix already contains all spin components interleaved
+                if(this->uom_save.size() != 0)
                 {
-                    if (PARAM.inp.nspin == 4)
+                    const int size = locale[iat][target_l][0][0].nr * locale[iat][target_l][0][0].nc;
+                    for(int mm=0; mm<size; mm++)
                     {
-                        locale_save[iat][l][n][0] = locale[iat][l][n][0];
+                        this->uom_save[eff_pot_pw_index[iat]+mm] = locale[iat][target_l][0][0].c[mm];
                     }
-                    else if (PARAM.inp.nspin == 1 || PARAM.inp.nspin == 2)
+                }
+            }
+            else if (PARAM.inp.nspin == 1 || PARAM.inp.nspin == 2)
+            {
+                locale_save[iat][target_l][0][0] = locale[iat][target_l][0][0];
+                locale_save[iat][target_l][0][1] = locale[iat][target_l][0][1];
+                // save locale matrix for spin=0,1 to uom_save
+                if(this->uom_save.size() != 0)
+                {
+                    const int size = locale[iat][target_l][0][0].nr * locale[iat][target_l][0][0].nc;
+                    const int half_size = this->uom_save.size() / 2;
+                    for(int mm=0; mm<size; mm++)
                     {
-                        locale_save[iat][l][n][0] = locale[iat][l][n][0];
-                        locale_save[iat][l][n][1] = locale[iat][l][n][1];
+                        this->uom_save[eff_pot_pw_index[iat]+mm] = locale[iat][target_l][0][0].c[mm];
+                        this->uom_save[half_size + eff_pot_pw_index[iat]+mm] = locale[iat][target_l][0][1].c[mm];
                     }
                 }
             }
@@ -51,7 +68,7 @@ void Plus_U::zero_locale(const UnitCell& ucell)
 
     for (int T = 0; T < ucell.ntype; T++)
     {
-		if (orbital_corr[T] == -1) 
+		if (!has_correlated_orbital(T)) 
 		{ 
 			continue;
 		}
@@ -92,7 +109,7 @@ void Plus_U::mix_locale(const UnitCell& ucell,
 
     for (int T = 0; T < ucell.ntype; T++)
     {
-		if (orbital_corr[T] == -1) 
+		if (!has_correlated_orbital(T))
 		{
 			continue;
 		}
@@ -123,6 +140,79 @@ void Plus_U::mix_locale(const UnitCell& ucell,
     ModuleBase::timer::end("Plus_U", "mix_locale");
 }
 
+// set_locale — restore locale from uom_array (after mixing)
+//
+// nspin=1: locale[iat][l][n][0] from uom_array[eff_pot_pw_index[iat]+mm]
+// nspin=2: spin-up from uom_array[index+mm],
+//          spin-down from uom_array[half_size+index+mm]
+// nspin=4: all 4 Pauli blocks from uom_array[index+mm], mm in [0, 4*tlp1^2)
+void Plus_U::set_locale(const UnitCell& ucell)
+{
+    ModuleBase::TITLE("Plus_U", "set_locale");
+    ModuleBase::timer::start("Plus_U", "set_locale");
+
+    for (int T = 0; T < ucell.ntype; T++)
+    {
+        if (!has_correlated_orbital(T)) continue;
+        const int l = get_orbital_corr(T);
+        for (int I = 0; I < ucell.atoms[T].na; I++)
+        {
+            const int iat = ucell.itia2iat(T, I);
+            if (PARAM.inp.nspin == 4)
+            {
+                for(int mm = 0; mm < locale[iat][l][0][0].nr * locale[iat][l][0][0].nc; mm++)
+                    locale[iat][l][0][0].c[mm] = this->uom_array[eff_pot_pw_index[iat] + mm];
+            }
+            else if (PARAM.inp.nspin == 1 || PARAM.inp.nspin == 2)
+            {
+                const int half_size = this->uom_array.size() / 2;
+                for(int mm = 0; mm < locale[iat][l][0][0].nr * locale[iat][l][0][0].nc; mm++)
+                {
+                    locale[iat][l][0][0].c[mm] = this->uom_array[eff_pot_pw_index[iat] + mm];
+                    if (PARAM.inp.nspin == 2)
+                    {
+                        locale[iat][l][0][1].c[mm] = this->uom_array[half_size + eff_pot_pw_index[iat] + mm];
+                    }
+                }
+            }
+        }
+    }
+
+    ModuleBase::timer::end("Plus_U", "set_locale");
+}
+
+void Plus_U::get_locale_flat(const int iat, const int l, std::vector<double>& occ) const
+{
+    const int tlp1 = 2 * l + 1;
+    const int size = tlp1 * tlp1;
+    if (nspin == 2)
+    {
+        for (int is = 0; is < 2; is++)
+        {
+            for (int i = 0; i < size; i++)
+            {
+                occ[is * size + i] = locale[iat][l][0][is].c[i];
+            }
+        }
+    }
+    else
+    {
+        for (int i = 0; i < static_cast<int>(occ.size()); i++)
+        {
+            occ[i] = locale[iat][l][0][0].c[i];
+        }
+    }
+}
+
+void Plus_U::set_locale_flat(const int iat, const int l, const int spin,
+                             const std::vector<double>& occ)
+{
+    for (int i = 0; i < static_cast<int>(occ.size()); i++)
+    {
+        locale[iat][l][0][spin].c[i] = occ[i];
+    }
+}
+
 #ifdef __LCAO
 
 void Plus_U::cal_occup_m_k(const int iter, 
@@ -210,7 +300,7 @@ void Plus_U::cal_occup_m_k(const int iter,
         for (int it = 0; it < ucell.ntype; it++)
         {
             const int NL = ucell.atoms[it].nwl + 1;
-            const int LC = orbital_corr[it];
+            const int LC = get_orbital_corr(it);
 
 			if (LC == -1) 
 			{
@@ -223,7 +313,7 @@ void Plus_U::cal_occup_m_k(const int iter,
 
                 for (int l = 0; l < NL; l++)
                 {
-					if (l != orbital_corr[it]) 
+					if (l != get_orbital_corr(it)) 
 					{
 						continue;
 					}
@@ -284,7 +374,7 @@ void Plus_U::cal_occup_m_k(const int iter,
     for (int it = 0; it < ucell.ntype; it++)
     {
         const int NL = ucell.atoms[it].nwl + 1;
-        const int LC = orbital_corr[it];
+        const int LC = get_orbital_corr(it);
 
 		if (LC == -1) 
 		{
@@ -297,7 +387,7 @@ void Plus_U::cal_occup_m_k(const int iter,
 
             for (int l = 0; l < NL; l++)
             {
-				if (l != orbital_corr[it]) 
+				if (l != get_orbital_corr(it)) 
 				{
 					continue;
 				}
@@ -371,12 +461,12 @@ void Plus_U::cal_occup_m_k(const int iter,
         } // end ia
     } // end it
 
-    if(mixing_dftu && initialed_locale)
+    if(is_mixing_enabled() && is_locale_initialized())
     {
         this->mix_locale(ucell,mixing_beta);
     }
 
-    this->initialed_locale = true;
+    mark_locale_initialized();
     ModuleBase::timer::end("Plus_U", "cal_occup_m_k");
     return;
 }
@@ -430,7 +520,7 @@ void Plus_U::cal_occup_m_gamma(const int iter,
         for (int it = 0; it < ucell.ntype; it++)
         {
             const int NL = ucell.atoms[it].nwl + 1;
-			if (orbital_corr[it] == -1) 
+			if (!has_correlated_orbital(it)) 
 			{
 				continue;
 			}
@@ -440,7 +530,7 @@ void Plus_U::cal_occup_m_gamma(const int iter,
 
                 for (int l = 0; l < NL; l++)
                 {
-					if (l != orbital_corr[it]) 
+					if (l != get_orbital_corr(it)) 
 					{
 						continue;
 					}
@@ -529,12 +619,12 @@ void Plus_U::cal_occup_m_gamma(const int iter,
         } // it
     } // is
 
-    if(mixing_dftu && initialed_locale)
+    if(is_mixing_enabled() && is_locale_initialized())
     {
         this->mix_locale(ucell,mixing_beta);
     }
 
-    this->initialed_locale = true;
+    mark_locale_initialized();
     ModuleBase::timer::end("Plus_U", "cal_occup_m_gamma");
     return;
 }
diff --git a/source/source_lcao/module_dftu/dftu_pw.cpp b/source/source_lcao/module_dftu/dftu_pw.cpp
index 7a1a9bac3a6..b09b417cacd 100644
--- a/source/source_lcao/module_dftu/dftu_pw.cpp
+++ b/source/source_lcao/module_dftu/dftu_pw.cpp
@@ -3,14 +3,27 @@
 #include "source_base/parallel_reduce.h"
 #include "source_io/module_parameter/parameter.h"
 #include "source_base/timer.h"
+#include "source_base/parallel_global.h"
 
-
-/// calculate occupation matrix for DFT+U
+/// calculate occupation matrix for DFT+U (PW basis)
+///
+/// nspin=1 (npol=1): single spin channel; locale[iat][l][n][0] only;
+///   eff_pot_pw has one block of tlp1^2 per atom.
+///
+/// nspin=2 (npol=1): two spin channels stored separately:
+///   locale[iat][l][n][0] = spin-up, locale[iat][l][n][1] = spin-down;
+///   becp indices: ib*nkb + begin_ih + m (same formula for both spins);
+///   spin channel selected by `isk[ik]` (not ik >= nk/2, which fails for kpar>1);
+///
+/// nspin=4 (npol=2): spinor calculation;
+///   locale has a single matrix of size (2*tlp1) x (2*tlp1) per atom
+///   storing all 4 Pauli blocks contiguously.
 void Plus_U::cal_occ_pw(const int iter, 
 		const void* psi_in, 
 		const ModuleBase::matrix& wg_in, 
 		const UnitCell& cell, 
-		const double& mixing_beta)
+		Charge_Mixing* p_chgmix,
+		const int* isk)
 {
     ModuleBase::timer::start("Plus_U", "cal_occ_pw");
     this->copy_locale(cell);
@@ -20,58 +33,79 @@ void Plus_U::cal_occ_pw(const int iter,
     {
         auto* onsite_p = projectors::OnsiteProjector<double, base_device::DEVICE_CPU>::get_instance();
         const psi::Psi<std::complex<double>>* psi_p = (const psi::Psi<std::complex<double>>*)psi_in;
-        // loop over k-points to calculate Mi of \sum_{k,i,l,m}<Psi_{k,i}|alpha_{l,m}><alpha_{l,m}|Psi_{k,i}>
         const int nbands = psi_p->get_nbands();
+        const int npol = psi_p->get_npol();
         for(int ik = 0; ik < psi_p->get_nk(); ik++)
         {
+            int is = (PARAM.inp.nspin == 2) ? isk[ik] : 0;
             psi_p->fix_k(ik);
             onsite_p->tabulate_atomic(ik);
 
-            onsite_p->overlap_proj_psi(nbands*psi_p->get_npol(), psi_p->get_pointer());
+            onsite_p->overlap_proj_psi(nbands*npol, psi_p->get_pointer());
             const std::complex<double>* becp = onsite_p->get_h_becp();
-            // becp(nbands*npol , nkb)
-            // mag = wg * \sum_{nh}becp * becp
-            int nkb = onsite_p->get_size_becp() / nbands / psi_p->get_npol();
+            int nkb = onsite_p->get_size_becp() / nbands / npol;
+
             int begin_ih = 0;
             for(int iat = 0; iat < cell.nat; iat++)
             {
                 const int it = cell.iat2it[iat];
                 const int nh = onsite_p->get_nh(iat);
-                const int target_l = this->orbital_corr[it];
-                if(target_l == -1)
+                const int target_l = get_orbital_corr(it);
+                if(!has_correlated_orbital(it))
                 {
                     begin_ih += nh;
                     continue;
                 }
-                // m = l^2, l^2+1, ..., (l+1)^2-1
                 const int m_begin = target_l * target_l;
                 const int tlp1 = 2 * target_l + 1;
                 const int tlp1_2 = tlp1 * tlp1;
-                for(int ib = 0;ib<nbands;ib++)
+                if(PARAM.inp.nspin == 4)
                 {
-                    const double weight = wg_in(ik, ib);
-                    int ind_m1m2 = 0;
-                    for(int m1 = 0; m1 < tlp1; m1++)
+                    for(int ib = 0;ib<nbands;ib++)
                     {
-                        const int index_m1 = ib*2*nkb + begin_ih + m_begin + m1;
-                        for(int m2 = 0; m2 < tlp1; m2++)
+                        const double weight = wg_in(ik, ib);
+                        int ind_m1m2 = 0;
+                        for(int m1 = 0; m1 < tlp1; m1++)
                         {
-                            const int index_m2 = ib*2*nkb + begin_ih + m_begin + m2;
-                            std::complex<double> occ[4];
-                            occ[0] = weight * conj(becp[index_m1]) * becp[index_m2];
-                            occ[1] = weight * conj(becp[index_m1]) * becp[index_m2 + nkb];
-                            occ[2] = weight * conj(becp[index_m1 + nkb]) * becp[index_m2];
-                            occ[3] = weight * conj(becp[index_m1 + nkb]) * becp[index_m2 + nkb];
-                            this->locale[iat][target_l][0][0].c[ind_m1m2] += (occ[0] + occ[3]).real();
-                            this->locale[iat][target_l][0][0].c[ind_m1m2 + tlp1_2] += (occ[1] + occ[2]).real();
-                            this->locale[iat][target_l][0][0].c[ind_m1m2 + 2 * tlp1_2] += (occ[1] - occ[2]).imag();
-                            this->locale[iat][target_l][0][0].c[ind_m1m2 + 3 * tlp1_2] += (occ[0] - occ[3]).real();
-                            ind_m1m2++;
+                            const int index_m1 = ib*npol*nkb + begin_ih + m_begin + m1;
+                            for(int m2 = 0; m2 < tlp1; m2++)
+                            {
+                                const int index_m2 = ib*npol*nkb + begin_ih + m_begin + m2;
+                                std::complex<double> occ[4];
+                                occ[0] = weight * conj(becp[index_m1]) * becp[index_m2];
+                                occ[1] = weight * conj(becp[index_m1]) * becp[index_m2 + nkb];
+                                occ[2] = weight * conj(becp[index_m1 + nkb]) * becp[index_m2];
+                                occ[3] = weight * conj(becp[index_m1 + nkb]) * becp[index_m2 + nkb];
+                                this->locale[iat][target_l][0][0].c[ind_m1m2] += (occ[0] + occ[3]).real();
+                                this->locale[iat][target_l][0][0].c[ind_m1m2 + tlp1_2] += (occ[1] + occ[2]).real();
+                                this->locale[iat][target_l][0][0].c[ind_m1m2 + 2 * tlp1_2] += (occ[1] - occ[2]).imag();
+                                this->locale[iat][target_l][0][0].c[ind_m1m2 + 3 * tlp1_2] += (occ[0] - occ[3]).real();
+                                ind_m1m2++;
+                            }
                         }
-                    }
-                }// ib
+                    }// ib
+                }
+                else // nspin=1 or nspin=2
+                {
+                    for(int ib = 0;ib<nbands;ib++)
+                    {
+                        const double weight = wg_in(ik, ib);
+                        int ind_m1m2 = 0;
+                        for(int m1 = 0; m1 < tlp1; m1++)
+                        {
+                            const int index_m1 = ib*nkb + begin_ih + m_begin + m1;
+                            for(int m2 = 0; m2 < tlp1; m2++)
+                            {
+                                const int index_m2 = ib*nkb + begin_ih + m_begin + m2;
+                                this->locale[iat][target_l][0][is].c[ind_m1m2] += weight * (conj(becp[index_m1]) * becp[index_m2]).real();
+                                ind_m1m2++;
+                            }
+                        }
+                    }// ib
+                }
                 begin_ih += nh;
             }// iat
+
         }// ik
     }
 #if defined(__CUDA) || defined(__ROCM)
@@ -79,141 +113,246 @@ void Plus_U::cal_occ_pw(const int iter,
     {
         auto* onsite_p = projectors::OnsiteProjector<double, base_device::DEVICE_GPU>::get_instance();
         const psi::Psi<std::complex<double>, base_device::DEVICE_GPU>* psi_p = (const psi::Psi<std::complex<double>, base_device::DEVICE_GPU>*)psi_in;
-        // loop over k-points to calculate Mi of \sum_{k,i,l,m}<Psi_{k,i}|alpha_{l,m}><alpha_{l,m}|Psi_{k,i}>
         const int nbands = psi_p->get_nbands();
+        const int npol = psi_p->get_npol();
         for(int ik = 0; ik < psi_p->get_nk(); ik++)
         {
+            int is = (PARAM.inp.nspin == 2) ? isk[ik] : 0;
             psi_p->fix_k(ik);
             onsite_p->tabulate_atomic(ik);
 
-            onsite_p->overlap_proj_psi(nbands*psi_p->get_npol(), psi_p->get_pointer());
+            onsite_p->overlap_proj_psi(nbands*npol, psi_p->get_pointer());
             const std::complex<double>* becp = onsite_p->get_h_becp();
-            // becp(nbands*npol , nkb)
-            // mag = wg * \sum_{nh}becp * becp
-            int nkb = onsite_p->get_size_becp() / nbands / psi_p->get_npol();
+            int nkb = onsite_p->get_size_becp() / nbands / npol;
             int begin_ih = 0;
             for(int iat = 0; iat < cell.nat; iat++)
             {
                 const int it = cell.iat2it[iat];
                 const int nh = onsite_p->get_nh(iat);
-                const int target_l = this->orbital_corr[it];
-                if(target_l == -1)
+                const int target_l = get_orbital_corr(it);
+                if(!has_correlated_orbital(it))
                 {
                     begin_ih += nh;
                     continue;
                 }
-                // m = l^2, l^2+1, ..., (l+1)^2-1
                 const int m_begin = target_l * target_l;
                 const int tlp1 = 2 * target_l + 1;
                 const int tlp1_2 = tlp1 * tlp1;
-                for(int ib = 0;ib<nbands;ib++)
+                if(PARAM.inp.nspin == 4)
                 {
-                    const double weight = wg_in(ik, ib);
-                    int ind_m1m2 = 0;
-                    for(int m1 = 0; m1 < tlp1; m1++)
+                    for(int ib = 0;ib<nbands;ib++)
                     {
-                        const int index_m1 = ib*2*nkb + begin_ih + m_begin + m1;
-                        for(int m2 = 0; m2 < tlp1; m2++)
+                        const double weight = wg_in(ik, ib);
+                        int ind_m1m2 = 0;
+                        for(int m1 = 0; m1 < tlp1; m1++)
                         {
-                            const int index_m2 = ib*2*nkb + begin_ih + m_begin + m2;
-                            std::complex<double> occ[4];
-                            occ[0] = weight * conj(becp[index_m1]) * becp[index_m2];
-                            occ[1] = weight * conj(becp[index_m1]) * becp[index_m2 + nkb];
-                            occ[2] = weight * conj(becp[index_m1 + nkb]) * becp[index_m2];
-                            occ[3] = weight * conj(becp[index_m1 + nkb]) * becp[index_m2 + nkb];
-                            this->locale[iat][target_l][0][0].c[ind_m1m2] += (occ[0] + occ[3]).real();
-                            this->locale[iat][target_l][0][0].c[ind_m1m2 + tlp1_2] += (occ[1] + occ[2]).real();
-                            this->locale[iat][target_l][0][0].c[ind_m1m2 + 2 * tlp1_2] += (occ[1] - occ[2]).imag();
-                            this->locale[iat][target_l][0][0].c[ind_m1m2 + 3 * tlp1_2] += (occ[0] - occ[3]).real();
-                            ind_m1m2++;
+                            const int index_m1 = ib*npol*nkb + begin_ih + m_begin + m1;
+                            for(int m2 = 0; m2 < tlp1; m2++)
+                            {
+                                const int index_m2 = ib*npol*nkb + begin_ih + m_begin + m2;
+                                std::complex<double> occ[4];
+                                occ[0] = weight * conj(becp[index_m1]) * becp[index_m2];
+                                occ[1] = weight * conj(becp[index_m1]) * becp[index_m2 + nkb];
+                                occ[2] = weight * conj(becp[index_m1 + nkb]) * becp[index_m2];
+                                occ[3] = weight * conj(becp[index_m1 + nkb]) * becp[index_m2 + nkb];
+                                this->locale[iat][target_l][0][0].c[ind_m1m2] += (occ[0] + occ[3]).real();
+                                this->locale[iat][target_l][0][0].c[ind_m1m2 + tlp1_2] += (occ[1] + occ[2]).real();
+                                this->locale[iat][target_l][0][0].c[ind_m1m2 + 2 * tlp1_2] += (occ[1] - occ[2]).imag();
+                                this->locale[iat][target_l][0][0].c[ind_m1m2 + 3 * tlp1_2] += (occ[0] - occ[3]).real();
+                                ind_m1m2++;
+                            }
                         }
-                    }
-                }// ib
+                    }// ib
+                }
+                else // nspin=1 or nspin=2
+                {
+                    for(int ib = 0;ib<nbands;ib++)
+                    {
+                        const double weight = wg_in(ik, ib);
+                        int ind_m1m2 = 0;
+                        for(int m1 = 0; m1 < tlp1; m1++)
+                        {
+                            const int index_m1 = ib*nkb + begin_ih + m_begin + m1;
+                            for(int m2 = 0; m2 < tlp1; m2++)
+                            {
+                                const int index_m2 = ib*nkb + begin_ih + m_begin + m2;
+                                this->locale[iat][target_l][0][is].c[ind_m1m2] += weight * (conj(becp[index_m1]) * becp[index_m2]).real();
+                                ind_m1m2++;
+                            }
+                        }
+                    }// ib
+                }
                 begin_ih += nh;
             }// iat
         }// ik
     }
 #endif
 
-    Plus_U::energy_u = 0.0;
-    // reduce mag from all k-pools
+    // reduce locale from all k-pools
     for(int iat = 0; iat < cell.nat; iat++)
     {
         const int it = cell.iat2it[iat];
-        const int target_l = this->orbital_corr[it];
-        if(target_l == -1)
+        const int target_l = get_orbital_corr(it);
+        if(!has_correlated_orbital(it))
         {
             continue;
         }
         const int size = (2 * target_l + 1) * (2 * target_l + 1);
 
-		Parallel_Reduce::reduce_double_allpool(PARAM.inp.kpar, 
-				PARAM.globalv.nproc_in_pool, 
-				this->locale[iat][target_l][0][0].c, 
-				size * PARAM.inp.nspin);
+        if(PARAM.inp.nspin != 4)
+        {
+            Parallel_Reduce::reduce_double_allpool(PARAM.inp.kpar, 
+                    GlobalV::NPROC_IN_POOL, 
+                    this->locale[iat][target_l][0][0].c, 
+                    size);
+            if(PARAM.inp.nspin == 2)
+            {
+                Parallel_Reduce::reduce_double_allpool(PARAM.inp.kpar, 
+                        GlobalV::NPROC_IN_POOL, 
+                        this->locale[iat][target_l][0][1].c, 
+                        size);
+            }
+        }
+        else
+        {
+            Parallel_Reduce::reduce_double_allpool(PARAM.inp.kpar, 
+                    GlobalV::NPROC_IN_POOL, 
+                    this->locale[iat][target_l][0][0].c, 
+                    size * 4);
+        }
+
+        // save locale matrix for this iat to uom_array
+        if(this->uom_array.size() != 0)
+        {
+            for(int mm=0;mm<size;mm++)
+            {
+                this->uom_array[eff_pot_pw_index[iat]+mm] = this->locale[iat][target_l][0][0].c[mm];
+            }
+            if(PARAM.inp.nspin == 2)
+            {
+                const int half_size = this->uom_array.size() / 2;
+                for(int mm=0;mm<size;mm++)
+                {
+                    this->uom_array[half_size + eff_pot_pw_index[iat]+mm] = this->locale[iat][target_l][0][1].c[mm];
+                }
+            }
+        }
+    }
+
+    // mixing
+    if(is_mixing_enabled() && p_chgmix != nullptr)
+    {
+        p_chgmix->mix_uom(this->uom_array, this->uom_save);
+        this->set_locale(cell);
+    }
+
+    Plus_U::energy_u = 0.0;
+    const double weight_eu = (PARAM.inp.nspin == 1) ? 1.0 : (PARAM.inp.nspin == 2) ? 0.5 : 0.25;
+    const double diag_coeff = (PARAM.inp.nspin == 4) ? 1.0 : 0.5;
+    // calculate VU and energy (locale already reduced above)
+    for(int iat = 0; iat < cell.nat; iat++)
+    {
+        const int it = cell.iat2it[iat];
+        const int target_l = get_orbital_corr(it);
+        if(!has_correlated_orbital(it))
+        {
+            continue;
+        }
+        const int size = (2 * target_l + 1) * (2 * target_l + 1);
 
         //update effective potential
         const double u_value = this->U[it];
         std::complex<double>* vu_iat = &(this->eff_pot_pw[this->eff_pot_pw_index[iat]]);
         const int m_size = 2 * target_l + 1;
-        for (int m1 = 0; m1 < m_size; m1++)
+
+        if(PARAM.inp.nspin == 4)
         {
-            for (int m2 = 0; m2 < m_size; m2++)
+            for (int m1 = 0; m1 < m_size; m1++)
             {
-                vu_iat[m1 * m_size + m2] = u_value * 
-                  (1.0 * (m1 == m2) - this->locale[iat][target_l][0][0].c[m2 * m_size + m1]);
-                Plus_U::energy_u += u_value * 0.25 * this->locale[iat][target_l][0][0].c[m2 * m_size + m1] 
-                         * this->locale[iat][target_l][0][0].c[m1 * m_size + m2];
+                for (int m2 = 0; m2 < m_size; m2++)
+                {
+                    vu_iat[m1 * m_size + m2] = u_value * 
+                      (diag_coeff * (m1 == m2) - this->locale[iat][target_l][0][0].c[m2 * m_size + m1]);
+                    Plus_U::energy_u += u_value * weight_eu * this->locale[iat][target_l][0][0].c[m2 * m_size + m1] 
+                             * this->locale[iat][target_l][0][0].c[m1 * m_size + m2];
+                }
             }
-        }
-        for (int is = 1; is < 4; ++is)
-        {
-            int start = is * m_size * m_size;
+            for (int is = 1; is < 4; ++is)
+            {
+                int start = is * m_size * m_size;
+                for (int m1 = 0; m1 < m_size; m1++)
+                {
+                    for (int m2 = 0; m2 < m_size; m2++)
+                    {
+                        vu_iat[start + m1 * m_size + m2] = u_value * 
+                          (0 - this->locale[iat][target_l][0][0].c[start + m2 * m_size + m1]);
+                        Plus_U::energy_u += u_value * weight_eu 
+                                 * this->locale[iat][target_l][0][0].c[start + m2 * m_size + m1] 
+                                 * this->locale[iat][target_l][0][0].c[start + m1 * m_size + m2];
+                    }
+                }
+            }
+            // transfer from Pauli matrix representation to spin representation 
             for (int m1 = 0; m1 < m_size; m1++)
             {
                 for (int m2 = 0; m2 < m_size; m2++)
                 {
-                    vu_iat[start + m1 * m_size + m2] = u_value * 
-                      (0 - this->locale[iat][target_l][0][0].c[start + m2 * m_size + m1]);
-                    Plus_U::energy_u += u_value * 0.25 
-                             * this->locale[iat][target_l][0][0].c[start + m2 * m_size + m1] 
-                             * this->locale[iat][target_l][0][0].c[start + m1 * m_size + m2];
+                    int index[4];
+                    index[0] = m1 * m_size + m2;
+                    index[1] = m1 * m_size + m2 + size;
+                    index[2] = m1 * m_size + m2 + size * 2;
+                    index[3] = m1 * m_size + m2 + size * 3;
+                    std::complex<double> vu_tmp[4];
+                    for (int i = 0; i < 4; i++)
+                    {
+                        vu_tmp[i] = vu_iat[index[i]];
+                    }
+                    vu_iat[index[0]] = 0.5 * (vu_tmp[0] + vu_tmp[3]);
+                    vu_iat[index[3]] = 0.5 * (vu_tmp[0] - vu_tmp[3]);
+                    vu_iat[index[1]] = 0.5 * (vu_tmp[1] + std::complex<double>(0.0, 1.0) * vu_tmp[2]);
+                    vu_iat[index[2]] = 0.5 * (vu_tmp[1] - std::complex<double>(0.0, 1.0) * vu_tmp[2]);
                 }
             }
         }
-        // transfer from Pauli matrix representation to spin representation 
-        for (int m1 = 0; m1 < m_size; m1++)
+        else // nspin=1 or nspin=2
         {
-            for (int m2 = 0; m2 < m_size; m2++)
+            // spin-up channel
+            for (int m1 = 0; m1 < m_size; m1++)
+            {
+                for (int m2 = 0; m2 < m_size; m2++)
+                {
+                    vu_iat[m1 * m_size + m2] = u_value * 
+                      (diag_coeff * (m1 == m2) - this->locale[iat][target_l][0][0].c[m2 * m_size + m1]);
+                    Plus_U::energy_u += u_value * weight_eu * this->locale[iat][target_l][0][0].c[m2 * m_size + m1] 
+                             * this->locale[iat][target_l][0][0].c[m1 * m_size + m2];
+                }
+            }
+            // spin-down channel for nspin=2
+            if(PARAM.inp.nspin == 2)
             {
-                int index[4];
-                index[0] = m1 * m_size + m2;
-                index[1] = m1 * m_size + m2 + size;
-                index[2] = m1 * m_size + m2 + size * 2;
-                index[3] = m1 * m_size + m2 + size * 3;
-                std::complex<double> vu_tmp[4];
-                for (int i = 0; i < 4; i++)
+                std::complex<double>* vu_iat1 = &(this->eff_pot_pw[this->eff_pot_pw.size()/2 + this->eff_pot_pw_index[iat]]);
+                for (int m1 = 0; m1 < m_size; m1++)
                 {
-                    vu_tmp[i] = vu_iat[index[i]];
+                    for (int m2 = 0; m2 < m_size; m2++)
+                    {
+                        vu_iat1[m1 * m_size + m2] = u_value * 
+                          (diag_coeff * (m1 == m2) - this->locale[iat][target_l][0][1].c[m2 * m_size + m1]);
+                        Plus_U::energy_u += u_value * weight_eu * this->locale[iat][target_l][0][1].c[m2 * m_size + m1] 
+                                 * this->locale[iat][target_l][0][1].c[m1 * m_size + m2];
+                    }
                 }
-                vu_iat[index[0]] = 0.5 * (vu_tmp[0] + vu_tmp[3]);
-                vu_iat[index[3]] = 0.5 * (vu_tmp[0] - vu_tmp[3]);
-                vu_iat[index[1]] = 0.5 * (vu_tmp[1] + std::complex<double>(0.0, 1.0) * vu_tmp[2]);
-                vu_iat[index[2]] = 0.5 * (vu_tmp[1] - std::complex<double>(0.0, 1.0) * vu_tmp[2]);
             }
         }
     }
 
-    if(mixing_dftu && initialed_locale)
-    {
-        this->mix_locale(cell, mixing_beta);
-    }
-    // update effective potential
     ModuleBase::timer::end("Plus_U", "cal_occ_pw");
 }
 /// calculate the local DFT+U effective potential matrix for PW base.
+/// TODO: implement VU potential calculation for PW basis
 void Plus_U::cal_VU_pot_pw(const int spin)
 {
-
+    // Placeholder: VU potential for PW is computed via cal_eff_pot_mat_* in the
+    // onsite projector path. This function is reserved for future direct-PW implementation.
+    (void)spin;
 }
 
diff --git a/source/source_lcao/module_dftu/test/CMakeLists.txt b/source/source_lcao/module_dftu/test/CMakeLists.txt
new file mode 100644
index 00000000000..82d179d52b3
--- /dev/null
+++ b/source/source_lcao/module_dftu/test/CMakeLists.txt
@@ -0,0 +1,5 @@
+AddTest(
+  TARGET dftu_pw_test
+  LIBS ${math_libs} base device parameter
+  SOURCES dftu_pw_test.cpp
+)
diff --git a/source/source_lcao/module_dftu/test/dftu_pw_test.cpp b/source/source_lcao/module_dftu/test/dftu_pw_test.cpp
new file mode 100644
index 00000000000..5fd0083861c
--- /dev/null
+++ b/source/source_lcao/module_dftu/test/dftu_pw_test.cpp
@@ -0,0 +1,1057 @@
+#include "gtest/gtest.h"
+#include <complex>
+#define private public
+#include "source_io/module_parameter/parameter.h"
+#undef private
+
+/***********************************************************************
+ * Unit tests for DFT+U PW nspin=1/2/4 support (PR-2)
+ *
+ * Strategy: test energy weights and becp index logic as pure
+ * arithmetic — no need to link against full ABACUS libraries.
+ * set_locale is tested via integration tests.
+ ***********************************************************************/
+
+class DftuPwTest : public ::testing::Test
+{
+  protected:
+    void SetUp() override {}
+    void TearDown() override {}
+};
+
+// =====================================================================
+// Energy weight tests
+// =====================================================================
+
+TEST_F(DftuPwTest, EnergyWeightsNspin1)
+{
+    PARAM.input.nspin = 1;
+    double weight_eu = 1;
+    switch(PARAM.inp.nspin)
+    {
+        case 1: weight_eu = 1.0; break;
+        case 2: weight_eu = 0.5; break;
+        case 4: weight_eu = 0.25; break;
+        default: break;
+    }
+    const double diag_coeff = PARAM.inp.nspin == 4 ? 1.0 : 0.5;
+    EXPECT_DOUBLE_EQ(weight_eu, 1.0);
+    EXPECT_DOUBLE_EQ(diag_coeff, 0.5);
+}
+
+TEST_F(DftuPwTest, EnergyWeightsNspin2)
+{
+    PARAM.input.nspin = 2;
+    double weight_eu = 1;
+    switch(PARAM.inp.nspin)
+    {
+        case 1: weight_eu = 1.0; break;
+        case 2: weight_eu = 0.5; break;
+        case 4: weight_eu = 0.25; break;
+        default: break;
+    }
+    const double diag_coeff = PARAM.inp.nspin == 4 ? 1.0 : 0.5;
+    EXPECT_DOUBLE_EQ(weight_eu, 0.5);
+    EXPECT_DOUBLE_EQ(diag_coeff, 0.5);
+}
+
+TEST_F(DftuPwTest, EnergyWeightsNspin4)
+{
+    PARAM.input.nspin = 4;
+    double weight_eu = 1;
+    switch(PARAM.inp.nspin)
+    {
+        case 1: weight_eu = 1.0; break;
+        case 2: weight_eu = 0.5; break;
+        case 4: weight_eu = 0.25; break;
+        default: break;
+    }
+    const double diag_coeff = PARAM.inp.nspin == 4 ? 1.0 : 0.5;
+    EXPECT_DOUBLE_EQ(weight_eu, 0.25);
+    EXPECT_DOUBLE_EQ(diag_coeff, 1.0);
+}
+
+// =====================================================================
+// Becp index tests
+// =====================================================================
+
+TEST_F(DftuPwTest, OccupNspin12Index)
+{
+    const int nkb = 10, begin_ih = 3, m_begin = 4, m = 2, ib = 5;
+    // nspin=1/2: index = ib*nkb + begin_ih + m_begin + m
+    const int index_nspin12 = ib * nkb + begin_ih + m_begin + m;
+    EXPECT_EQ(index_nspin12, 59);
+    // different from nspin=4
+    const int index_nspin4 = ib * 2 * nkb + begin_ih + m_begin + m;
+    EXPECT_NE(index_nspin12, index_nspin4);
+}
+
+TEST_F(DftuPwTest, OccupNspin4Index)
+{
+    const int nkb = 10, begin_ih = 3, m_begin = 4, m = 2, ib = 5;
+    const int index_nspin4 = ib * 2 * nkb + begin_ih + m_begin + m;
+    EXPECT_EQ(index_nspin4, 109);
+}
+
+// =====================================================================
+// set_locale logic tests (pure array copy, no UnitCell needed)
+// =====================================================================
+
+TEST_F(DftuPwTest, SetLocaleNspin4)
+{
+    // Simulate set_locale for nspin=4: uom_array -> locale copy
+    PARAM.input.nspin = 4;
+    const int mat_size = 10; // (2*2+1)*2 for d-orbital with npol=2
+    const int total = mat_size * mat_size; // 100
+
+    std::vector<double> uom_array(total);
+    for(int i = 0; i < total; i++)
+        uom_array[i] = static_cast<double>(i + 1);
+
+    // Simulate locale as raw array (same as ModuleBase::matrix::c)
+    std::vector<double> locale_c(total, 0.0);
+
+    // nspin=4 branch: direct copy
+    for(int mm = 0; mm < total; mm++)
+        locale_c[mm] = uom_array[mm];
+
+    for(int i = 0; i < total; i++)
+        EXPECT_DOUBLE_EQ(locale_c[i], static_cast<double>(i + 1));
+}
+
+TEST_F(DftuPwTest, SetLocaleNspin2)
+{
+    // Simulate set_locale for nspin=2: uom_array -> locale copy (spin-up + spin-down)
+    PARAM.input.nspin = 2;
+    const int mat_size = 5; // 2*2+1 for d-orbital
+    const int size_per_spin = mat_size * mat_size; // 25
+    const int total = size_per_spin * 2; // 50
+
+    std::vector<double> uom_array(total);
+    for(int i = 0; i < size_per_spin; i++)
+    {
+        uom_array[i] = static_cast<double>(i + 1);                // spin-up
+        uom_array[i + size_per_spin] = static_cast<double>(i + 101); // spin-down
+    }
+
+    std::vector<double> locale_up(size_per_spin, 0.0);
+    std::vector<double> locale_dn(size_per_spin, 0.0);
+
+    // nspin=1/2 branch: copy both spin channels
+    const int nr_nc = size_per_spin; // locale[iat][l][0][0].nr * locale[iat][l][0][0].nc
+    for(int mm = 0; mm < nr_nc; mm++)
+    {
+        locale_up[mm] = uom_array[mm];
+        locale_dn[mm] = uom_array[mm + nr_nc];
+    }
+
+    for(int i = 0; i < size_per_spin; i++)
+    {
+        EXPECT_DOUBLE_EQ(locale_up[i], static_cast<double>(i + 1));
+        EXPECT_DOUBLE_EQ(locale_dn[i], static_cast<double>(i + 101));
+    }
+}
+
+// =====================================================================
+// VU effective potential tests (cal_occ_pw logic)
+// =====================================================================
+
+TEST_F(DftuPwTest, VUPotNspin1_DiagonalLocale)
+{
+    // For nspin=1: VU[m1,m2] = U * (0.5*delta(m1,m2) - locale[m2*m_size+m1])
+    // With diagonal locale: locale[m,m] = 0.3
+    const double U_val = 4.0;
+    const int m_size = 5; // d-orbital: 2*2+1
+    const int size = m_size * m_size;
+
+    std::vector<double> locale_c(size, 0.0);
+    for(int m = 0; m < m_size; m++)
+        locale_c[m * m_size + m] = 0.3; // diagonal
+
+    std::vector<std::complex<double>> vu(size, {0.0, 0.0});
+    for(int m1 = 0; m1 < m_size; m1++)
+    {
+        for(int m2 = 0; m2 < m_size; m2++)
+        {
+            const double diag_coeff = 0.5; // nspin != 4
+            vu[m1 * m_size + m2] = U_val *
+                (diag_coeff * (m1 == m2) - locale_c[m2 * m_size + m1]);
+        }
+    }
+
+    // diagonal: U*(0.5 - 0.3) = 4.0*0.2 = 0.8
+    for(int m = 0; m < m_size; m++)
+        EXPECT_DOUBLE_EQ(vu[m * m_size + m].real(), 0.8);
+
+    // off-diagonal: U*(0 - 0) = 0
+    EXPECT_DOUBLE_EQ(vu[0 * m_size + 1].real(), 0.0);
+    EXPECT_DOUBLE_EQ(vu[1 * m_size + 0].real(), 0.0);
+}
+
+TEST_F(DftuPwTest, VUPotNspin1_OffDiagonalLocale)
+{
+    // locale has off-diagonal elements
+    const double U_val = 3.0;
+    const int m_size = 3; // p-orbital: 2*1+1
+    const int size = m_size * m_size;
+
+    std::vector<double> locale_c(size, 0.0);
+    locale_c[0 * m_size + 1] = 0.1; // locale(0,1) = 0.1
+    locale_c[1 * m_size + 0] = 0.2; // locale(1,0) = 0.2
+
+    std::vector<std::complex<double>> vu(size, {0.0, 0.0});
+    for(int m1 = 0; m1 < m_size; m1++)
+    {
+        for(int m2 = 0; m2 < m_size; m2++)
+        {
+            vu[m1 * m_size + m2] = U_val *
+                (0.5 * (m1 == m2) - locale_c[m2 * m_size + m1]);
+        }
+    }
+
+    // VU[0,1] = U * (0 - locale[1*3+0]) = 3.0 * (-0.2) = -0.6
+    EXPECT_DOUBLE_EQ(vu[0 * m_size + 1].real(), -0.6);
+    // VU[1,0] = U * (0 - locale[0*3+1]) = 3.0 * (-0.1) = -0.3
+    EXPECT_DOUBLE_EQ(vu[1 * m_size + 0].real(), -0.3);
+}
+
+TEST_F(DftuPwTest, VUPotNspin2_TwoSpinChannels)
+{
+    // nspin=2: two independent spin channels with same formula
+    const double U_val = 5.0;
+    const int m_size = 3;
+    const int size = m_size * m_size;
+
+    std::vector<double> locale_up(size, 0.0);
+    std::vector<double> locale_dn(size, 0.0);
+    locale_up[0] = 0.4; // locale_up(0,0) = 0.4
+    locale_dn[0] = 0.1; // locale_dn(0,0) = 0.1
+
+    // VU_up[0,0] = U*(0.5 - 0.4) = 0.5
+    double vu_up_00 = U_val * (0.5 - locale_up[0 * m_size + 0]);
+    EXPECT_DOUBLE_EQ(vu_up_00, 0.5);
+
+    // VU_dn[0,0] = U*(0.5 - 0.1) = 2.0
+    double vu_dn_00 = U_val * (0.5 - locale_dn[0 * m_size + 0]);
+    EXPECT_DOUBLE_EQ(vu_dn_00, 2.0);
+}
+
+TEST_F(DftuPwTest, VUPotNspin4_PauliTransform)
+{
+    // nspin=4: after computing VU in Pauli basis, transform to spin basis
+    // vu_spin[0] = 0.5*(vu_pauli[0] + vu_pauli[3])
+    // vu_spin[3] = 0.5*(vu_pauli[0] - vu_pauli[3])
+    // vu_spin[1] = 0.5*(vu_pauli[1] + i*vu_pauli[2])
+    // vu_spin[2] = 0.5*(vu_pauli[1] - i*vu_pauli[2])
+    const int m_size = 3;
+    const int size = m_size * m_size;
+
+    // For a single (m1,m2) pair, test the Pauli->spin transform
+    std::complex<double> vu_pauli[4];
+    vu_pauli[0] = {1.0, 0.0}; // charge channel
+    vu_pauli[1] = {0.5, 0.0}; // sigma_x
+    vu_pauli[2] = {0.3, 0.0}; // sigma_y
+    vu_pauli[3] = {0.2, 0.0}; // sigma_z
+
+    std::complex<double> vu_spin[4];
+    vu_spin[0] = 0.5 * (vu_pauli[0] + vu_pauli[3]);
+    vu_spin[3] = 0.5 * (vu_pauli[0] - vu_pauli[3]);
+    vu_spin[1] = 0.5 * (vu_pauli[1] + std::complex<double>(0.0, 1.0) * vu_pauli[2]);
+    vu_spin[2] = 0.5 * (vu_pauli[1] - std::complex<double>(0.0, 1.0) * vu_pauli[2]);
+
+    EXPECT_DOUBLE_EQ(vu_spin[0].real(), 0.6);  // 0.5*(1.0+0.2)
+    EXPECT_DOUBLE_EQ(vu_spin[0].imag(), 0.0);
+    EXPECT_DOUBLE_EQ(vu_spin[3].real(), 0.4);  // 0.5*(1.0-0.2)
+    EXPECT_DOUBLE_EQ(vu_spin[3].imag(), 0.0);
+    EXPECT_DOUBLE_EQ(vu_spin[1].real(), 0.25); // 0.5*0.5
+    EXPECT_DOUBLE_EQ(vu_spin[1].imag(), 0.15); // 0.5*0.3
+    EXPECT_DOUBLE_EQ(vu_spin[2].real(), 0.25); // 0.5*0.5
+    EXPECT_DOUBLE_EQ(vu_spin[2].imag(), -0.15);// -0.5*0.3
+}
+
+// =====================================================================
+// Energy calculation tests
+// =====================================================================
+
+TEST_F(DftuPwTest, EnergyNspin1_DiagonalLocale)
+{
+    // E_U = sum_{m1,m2} U * weight_eu * locale[m2,m1] * locale[m1,m2]
+    // weight_eu = 1.0 for nspin=1
+    const double U_val = 4.0;
+    const int m_size = 3;
+    const int size = m_size * m_size;
+
+    std::vector<double> locale_c(size, 0.0);
+    locale_c[0 * m_size + 0] = 0.5;
+    locale_c[1 * m_size + 1] = 0.3;
+    locale_c[2 * m_size + 2] = 0.2;
+
+    double energy_u = 0.0;
+    const double weight_eu = 1.0;
+    for(int m1 = 0; m1 < m_size; m1++)
+    {
+        for(int m2 = 0; m2 < m_size; m2++)
+        {
+            energy_u += U_val * weight_eu * locale_c[m2 * m_size + m1]
+                        * locale_c[m1 * m_size + m2];
+        }
+    }
+
+    // Only diagonal contributes: U * (0.5^2 + 0.3^2 + 0.2^2) = 4*(0.25+0.09+0.04) = 4*0.38 = 1.52
+    EXPECT_DOUBLE_EQ(energy_u, 1.52);
+}
+
+TEST_F(DftuPwTest, EnergyNspin2_TwoChannels)
+{
+    // nspin=2: weight_eu = 0.5, sum over both spin channels
+    const double U_val = 2.0;
+    const int m_size = 3;
+    const int size = m_size * m_size;
+    const double weight_eu = 0.5;
+
+    std::vector<double> locale_up(size, 0.0);
+    std::vector<double> locale_dn(size, 0.0);
+    locale_up[0] = 0.4; // (0,0)
+    locale_dn[0] = 0.6; // (0,0)
+
+    double energy_u = 0.0;
+    // spin-up contribution
+    for(int m1 = 0; m1 < m_size; m1++)
+        for(int m2 = 0; m2 < m_size; m2++)
+            energy_u += U_val * weight_eu * locale_up[m2 * m_size + m1] * locale_up[m1 * m_size + m2];
+    // spin-down contribution
+    for(int m1 = 0; m1 < m_size; m1++)
+        for(int m2 = 0; m2 < m_size; m2++)
+            energy_u += U_val * weight_eu * locale_dn[m2 * m_size + m1] * locale_dn[m1 * m_size + m2];
+
+    // U*0.5*(0.4^2 + 0.6^2) = 2*0.5*(0.16+0.36) = 0.52
+    EXPECT_DOUBLE_EQ(energy_u, 0.52);
+}
+
+TEST_F(DftuPwTest, EnergyNspin4_WithOffDiagonal)
+{
+    // nspin=4: weight_eu = 0.25, includes off-diagonal Pauli components
+    const double U_val = 2.0;
+    const int m_size = 2; // simplified: s-orbital would be 1, use 2 for test
+    const int size = m_size * m_size;
+    const double weight_eu = 0.25;
+
+    // 4 Pauli components stored contiguously
+    std::vector<double> locale_c(size * 4, 0.0);
+    // charge channel (is=0)
+    locale_c[0] = 0.5; locale_c[1] = 0.1;
+    locale_c[2] = 0.1; locale_c[3] = 0.5;
+    // sigma_x (is=1)
+    locale_c[size + 0] = 0.2; locale_c[size + 1] = 0.0;
+    locale_c[size + 2] = 0.0; locale_c[size + 3] = 0.2;
+
+    double energy_u = 0.0;
+    for(int is = 0; is < 4; is++)
+    {
+        int start = is * size;
+        for(int m1 = 0; m1 < m_size; m1++)
+        {
+            for(int m2 = 0; m2 < m_size; m2++)
+            {
+                energy_u += U_val * weight_eu
+                    * locale_c[start + m2 * m_size + m1]
+                    * locale_c[start + m1 * m_size + m2];
+            }
+        }
+    }
+
+    // is=0: 2*0.25*(0.5*0.5 + 0.1*0.1 + 0.1*0.1 + 0.5*0.5) = 0.5*(0.25+0.01+0.01+0.25) = 0.26
+    // is=1: 2*0.25*(0.2*0.2 + 0 + 0 + 0.2*0.2) = 0.5*(0.04+0.04) = 0.04
+    // is=2,3: 0
+    EXPECT_DOUBLE_EQ(energy_u, 0.30);
+}
+
+// =====================================================================
+// Locale accumulation from becp (cal_occ_pw core loop)
+// =====================================================================
+
+TEST_F(DftuPwTest, LocaleAccumNspin12)
+{
+    // nspin=1/2: locale[m1*m_size+m2] += weight * real(conj(becp[m1]) * becp[m2])
+    const int m_size = 3; // p-orbital
+    const int nkb = 5;
+    const int begin_ih = 0;
+    const int m_begin = 0; // target_l=1, m_begin = 1*1 = 1... but for test simplicity use 0
+    const int nbands = 2;
+    const double weights[2] = {1.0, 0.5};
+
+    // becp array: becp[ib*nkb + begin_ih + m_begin + m]
+    std::vector<std::complex<double>> becp(nbands * nkb, {0.0, 0.0});
+    // band 0
+    becp[0 * nkb + 0] = {1.0, 0.0};
+    becp[0 * nkb + 1] = {0.0, 1.0};
+    becp[0 * nkb + 2] = {0.5, 0.5};
+    // band 1
+    becp[1 * nkb + 0] = {0.5, 0.0};
+    becp[1 * nkb + 1] = {0.5, -0.5};
+    becp[1 * nkb + 2] = {0.0, 1.0};
+
+    std::vector<double> locale_c(m_size * m_size, 0.0);
+    for(int ib = 0; ib < nbands; ib++)
+    {
+        const double weight = weights[ib];
+        int ind_m1m2 = 0;
+        for(int m1 = 0; m1 < m_size; m1++)
+        {
+            const int index_m1 = ib * nkb + begin_ih + m_begin + m1;
+            for(int m2 = 0; m2 < m_size; m2++)
+            {
+                const int index_m2 = ib * nkb + begin_ih + m_begin + m2;
+                locale_c[ind_m1m2] += weight * (std::conj(becp[index_m1]) * becp[index_m2]).real();
+                ind_m1m2++;
+            }
+        }
+    }
+
+    // band0, w=1.0: conj(becp0)*becp0 = |1|^2=1, conj(becp0)*becp1 = 1*(0,1)=(0,1)->real=0
+    // locale[0,0] from band0 = 1.0*1.0 = 1.0
+    // band1, w=0.5: conj(becp0)*becp0 = |0.5|^2=0.25
+    // locale[0,0] from band1 = 0.5*0.25 = 0.125
+    EXPECT_DOUBLE_EQ(locale_c[0], 1.125); // 1.0 + 0.125
+
+    // locale[1,1]: band0 = 1.0*|i|^2 = 1.0, band1 = 0.5*|(0.5,-0.5)|^2 = 0.5*0.5 = 0.25
+    EXPECT_DOUBLE_EQ(locale_c[4], 1.25);
+}
+
+TEST_F(DftuPwTest, LocaleAccumNspin4_PauliComponents)
+{
+    // nspin=4: 4 Pauli components from becp with npol=2
+    // occ[0] = w * conj(becp_up[m1]) * becp_up[m2]
+    // occ[1] = w * conj(becp_up[m1]) * becp_dn[m2]
+    // occ[2] = w * conj(becp_dn[m1]) * becp_up[m2]
+    // occ[3] = w * conj(becp_dn[m1]) * becp_dn[m2]
+    // locale[ind] += (occ[0]+occ[3]).real()       -- charge
+    // locale[ind+size] += (occ[1]+occ[2]).real()   -- sigma_x
+    // locale[ind+2*size] += (occ[1]-occ[2]).imag() -- sigma_y
+    // locale[ind+3*size] += (occ[0]-occ[3]).real() -- sigma_z
+
+    const int m_size = 1; // s-orbital for simplicity
+    const int nkb = 2;
+    const int nbands = 1;
+    const double weight = 1.0;
+
+    // becp layout: becp[ib*2*nkb + begin_ih + m]  (up)
+    //              becp[ib*2*nkb + begin_ih + m + nkb] (down)
+    std::vector<std::complex<double>> becp(nbands * 2 * nkb, {0.0, 0.0});
+    // m=0 only (s-orbital)
+    becp[0 * 2 * nkb + 0] = {0.8, 0.0};       // becp_up[m=0]
+    becp[0 * 2 * nkb + 0 + nkb] = {0.0, 0.6}; // becp_dn[m=0]
+
+    const int size = m_size * m_size; // 1
+    std::vector<double> locale_c(size * 4, 0.0);
+
+    for(int ib = 0; ib < nbands; ib++)
+    {
+        int ind_m1m2 = 0;
+        for(int m1 = 0; m1 < m_size; m1++)
+        {
+            const int index_m1 = ib * 2 * nkb + 0 + m1;
+            for(int m2 = 0; m2 < m_size; m2++)
+            {
+                const int index_m2 = ib * 2 * nkb + 0 + m2;
+                std::complex<double> occ[4];
+                occ[0] = weight * std::conj(becp[index_m1]) * becp[index_m2];
+                occ[1] = weight * std::conj(becp[index_m1]) * becp[index_m2 + nkb];
+                occ[2] = weight * std::conj(becp[index_m1 + nkb]) * becp[index_m2];
+                occ[3] = weight * std::conj(becp[index_m1 + nkb]) * becp[index_m2 + nkb];
+                locale_c[ind_m1m2] += (occ[0] + occ[3]).real();
+                locale_c[ind_m1m2 + size] += (occ[1] + occ[2]).real();
+                locale_c[ind_m1m2 + 2 * size] += (occ[1] - occ[2]).imag();
+                locale_c[ind_m1m2 + 3 * size] += (occ[0] - occ[3]).real();
+                ind_m1m2++;
+            }
+        }
+    }
+
+    // becp_up = (0.8, 0), becp_dn = (0, 0.6)
+    // occ[0] = conj(0.8)*0.8 = 0.64
+    // occ[1] = conj(0.8)*(0,0.6) = 0.8*(0,0.6) = (0, 0.48)
+    // occ[2] = conj(0,0.6)*0.8 = (0,-0.6)*0.8 = (0, -0.48)
+    // occ[3] = conj(0,0.6)*(0,0.6) = (0,-0.6)*(0,0.6) = 0.36
+    EXPECT_DOUBLE_EQ(locale_c[0], 1.0);    // (0.64+0.36).real = 1.0 (charge)
+    EXPECT_DOUBLE_EQ(locale_c[1], 0.0);    // (occ1+occ2).real = ((0,0.48)+(0,-0.48)).real = 0
+    EXPECT_DOUBLE_EQ(locale_c[2], 0.96);   // (occ1-occ2).imag = ((0,0.48)-(0,-0.48)).imag = 0.96
+    EXPECT_DOUBLE_EQ(locale_c[3], 0.28);   // (occ0-occ3).real = (0.64-0.36) = 0.28 (sigma_z)
+}
+
+TEST_F(DftuPwTest, CopyLocaleToUomSave_Nspin2)
+{
+    // Verify copy_locale logic for split layout: [all_up | all_dn]
+    const int m_size = 3;
+    const int size = m_size * m_size;
+
+    std::vector<double> locale_spin0(size), locale_spin1(size);
+    for(int i = 0; i < size; i++)
+    {
+        locale_spin0[i] = static_cast<double>(i + 1);
+        locale_spin1[i] = static_cast<double>(i + 100);
+    }
+
+    std::vector<double> uom_save(size * 2, 0.0);
+    const int eff_pot_index = 0;
+    const int half_size = uom_save.size() / 2;
+    for(int mm = 0; mm < size; mm++)
+    {
+        uom_save[eff_pot_index + mm] = locale_spin0[mm];
+        uom_save[half_size + eff_pot_index + mm] = locale_spin1[mm];
+    }
+
+    for(int i = 0; i < size; i++)
+    {
+        EXPECT_DOUBLE_EQ(uom_save[i], static_cast<double>(i + 1));
+        EXPECT_DOUBLE_EQ(uom_save[half_size + i], static_cast<double>(i + 100));
+    }
+}
+
+TEST_F(DftuPwTest, CopyLocaleToUomSave_Nspin4)
+{
+    // nspin=4: 4 blocks stored contiguously
+    const int m_size = 3;
+    const int size = m_size * m_size;
+    const int total = size * 4; // 4 Pauli components
+
+    std::vector<double> locale_c(total);
+    for(int i = 0; i < total; i++)
+        locale_c[i] = static_cast<double>(i + 1);
+
+    std::vector<double> uom_save(total, 0.0);
+    const int eff_pot_index = 0;
+    for(int mm = 0; mm < size; mm++)
+    {
+        uom_save[eff_pot_index + mm] = locale_c[mm];
+        uom_save[eff_pot_index + mm + size] = locale_c[mm + size];
+        uom_save[eff_pot_index + mm + 2 * size] = locale_c[mm + 2 * size];
+        uom_save[eff_pot_index + mm + 3 * size] = locale_c[mm + 3 * size];
+    }
+
+    for(int i = 0; i < total; i++)
+        EXPECT_DOUBLE_EQ(uom_save[i], static_cast<double>(i + 1));
+}
+
+// =====================================================================
+// Step 1: VU calculation test for nspin=2 (isolated from kernel)
+// This tests the complete cal_occ_pw vu calculation path:
+// becp -> locale -> vu_up/vu_dn
+// =====================================================================
+
+TEST_F(DftuPwTest, VU_Calculation_Nspin2_FullPath)
+{
+    // Simulate complete vu calculation for nspin=2
+    // This is the EXACT logic from cal_occ_pw, isolated from kernel
+
+    const int m_size = 5; // d-orbital: 2*2+1
+    const int size = m_size * m_size; // 25
+    const double U_val = 5.0;
+    const double weight_eu = 0.5; // nspin=2
+    const double diag_coeff = 0.5;
+
+    // Simulated locale values (would normally come from becp accumulation)
+    std::vector<double> locale_up(size, 0.0);
+    std::vector<double> locale_dn(size, 0.0);
+    // Set diagonal values typical for occupied d-orbitals
+    for(int m = 0; m < m_size; m++)
+    {
+        locale_up[m * m_size + m] = 0.8;
+        locale_dn[m * m_size + m] = 0.2;
+    }
+
+    // Calculate VU for spin-up
+    std::vector<std::complex<double>> vu_up(size, {0.0, 0.0});
+    for(int m1 = 0; m1 < m_size; m1++)
+    {
+        for(int m2 = 0; m2 < m_size; m2++)
+        {
+            vu_up[m1 * m_size + m2] = U_val *
+                (diag_coeff * (m1 == m2) - locale_up[m2 * m_size + m1]);
+        }
+    }
+
+    // Calculate VU for spin-down
+    std::vector<std::complex<double>> vu_dn(size, {0.0, 0.0});
+    for(int m1 = 0; m1 < m_size; m1++)
+    {
+        for(int m2 = 0; m2 < m_size; m2++)
+        {
+            vu_dn[m1 * m_size + m2] = U_val *
+                (diag_coeff * (m1 == m2) - locale_dn[m2 * m_size + m1]);
+        }
+    }
+
+    // Verify spin-up VU
+    // diagonal: U*(0.5 - 0.8) = 5*(-0.3) = -1.5
+    for(int m = 0; m < m_size; m++)
+    {
+        EXPECT_DOUBLE_EQ(vu_up[m * m_size + m].real(), -1.5);
+        EXPECT_DOUBLE_EQ(vu_up[m * m_size + m].imag(), 0.0);
+    }
+    // off-diagonal: U*(0 - 0) = 0
+    EXPECT_DOUBLE_EQ(vu_up[0 * m_size + 1].real(), 0.0);
+    EXPECT_DOUBLE_EQ(vu_up[1 * m_size + 0].real(), 0.0);
+
+    // Verify spin-down VU
+    // diagonal: U*(0.5 - 0.2) = 5*(0.3) = 1.5
+    for(int m = 0; m < m_size; m++)
+    {
+        EXPECT_DOUBLE_EQ(vu_dn[m * m_size + m].real(), 1.5);
+        EXPECT_DOUBLE_EQ(vu_dn[m * m_size + m].imag(), 0.0);
+    }
+    // off-diagonal: U*(0 - 0) = 0
+    EXPECT_DOUBLE_EQ(vu_dn[0 * m_size + 1].real(), 0.0);
+    EXPECT_DOUBLE_EQ(vu_dn[1 * m_size + 0].real(), 0.0);
+
+    // Verify energy calculation
+    double energy_u = 0.0;
+    for(int m1 = 0; m1 < m_size; m1++)
+        for(int m2 = 0; m2 < m_size; m2++)
+        {
+            energy_u += U_val * weight_eu * locale_up[m2 * m_size + m1] * locale_up[m1 * m_size + m2];
+            energy_u += U_val * weight_eu * locale_dn[m2 * m_size + m1] * locale_dn[m1 * m_size + m2];
+        }
+    // Only diagonal: 5 orbitals per spin channel
+    // spin-up: 5 * U * weight_eu * 0.8*0.8 = 5 * 5.0 * 0.5 * 0.64 = 8.0
+    // spin-down: 5 * U * weight_eu * 0.2*0.2 = 5 * 5.0 * 0.5 * 0.04 = 0.5
+    // total = 8.5
+    EXPECT_DOUBLE_EQ(energy_u, 8.5);
+}
+
+// =====================================================================
+// Step 2: Test vu_device sync for nspin=2
+// This verifies the vu transfer from eff_pot_pw to vu_device
+// =====================================================================
+
+TEST_F(DftuPwTest, VU_DeviceSync_Nspin2)
+{
+    // Simulate eff_pot_pw layout for nspin=2
+    const int m_size = 5;
+    const int size = m_size * m_size;
+    const int total_size = size * 2; // spin-up + spin-down
+
+    std::vector<std::complex<double>> eff_pot_pw(total_size);
+    // Initialize with known values
+    for(int i = 0; i < size; i++)
+    {
+        eff_pot_pw[i] = {static_cast<double>(i + 1), 0.0};         // spin-up
+        eff_pot_pw[i + size] = {static_cast<double>(i + 100), 0.0}; // spin-down
+    }
+
+    // Simulate vu_device sync for spin-down (isk[ik] == 1)
+    const int size_eff_pot_pw = total_size / 2;
+    std::vector<std::complex<double>> vu_device(size_eff_pot_pw);
+    // memcpy from eff_pot_pw[0] + size_eff_pot_pw
+    for(int i = 0; i < size_eff_pot_pw; i++)
+    {
+        vu_device[i] = eff_pot_pw[i + size_eff_pot_pw];
+    }
+
+    // Verify vu_device contains spin-down values
+    for(int i = 0; i < size; i++)
+    {
+        EXPECT_DOUBLE_EQ(vu_device[i].real(), static_cast<double>(i + 100));
+        EXPECT_DOUBLE_EQ(vu_device[i].imag(), 0.0);
+    }
+}
+
+// =====================================================================
+// Step 3: Test onsite_ps_op kernel for nspin=2 (npol=1)
+// This tests the vu application to ps without full ABACUS integration
+// =====================================================================
+
+TEST_F(DftuPwTest, OnsitePsOpKernel_Nspin2_Npol1)
+{
+    // Simulate the npol=1 branch of onsite_ps_op kernel
+    const int npm = 4;   // number of bands (npm/npol for npol=1)
+    const int npol = 1;
+    const int tnp = 10;  // total number of projectors
+    const int orb_l = 2; // d-orbital
+    const int tlp1 = 2 * orb_l + 1; // 5
+    const int nat = 2;
+
+    // vu array: 2 atoms, each with tlp1*tlp1 = 25 elements
+    std::vector<std::complex<double>> vu(nat * tlp1 * tlp1);
+    for(int i = 0; i < nat * tlp1 * tlp1; i++)
+        vu[i] = {static_cast<double>(i + 1), 0.0};
+
+    // ip_m: maps each projector to m index within its atom
+    // First atom (iat=0): projectors 0-4 map to m=0-4
+    // Second atom (iat=1): projectors 5-9 map to m=0-4
+    std::vector<int> ip_m = {0, 1, 2, 3, 4, 0, 1, 2, 3, 4};
+    std::vector<int> ip_iat = {0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
+    std::vector<int> vu_begin_iat = {0, tlp1 * tlp1};
+
+    // becp: npm * tnp
+    std::vector<std::complex<double>> becp(npm * tnp, {0.0, 0.0});
+    // Set some non-zero becp values
+    for(int ib = 0; ib < npm; ib++)
+        for(int ip = 0; ip < tnp; ip++)
+            becp[ib * tnp + ip] = {static_cast<double>(ib + ip + 1), 0.0};
+
+    // ps: tnp * npm
+    std::vector<std::complex<double>> ps(tnp * npm, {0.0, 0.0});
+
+    // Kernel logic for npol=1 (EXACT copy from onsite_op.cpp)
+    for(int ib = 0; ib < npm; ib++)
+    {
+        for(int ip = 0; ip < tnp; ip++)
+        {
+            int m1 = ip_m[ip];
+            if(m1 < 0) continue;
+            int iat = ip_iat[ip];
+            const std::complex<double>* vu_iat = vu.data() + vu_begin_iat[iat];
+            int ip2_begin = ip - m1;
+            int ip2_end = ip - m1 + tlp1;
+            const int psind = ip * npm + ib;
+            for(int ip2 = ip2_begin; ip2 < ip2_end; ip2++)
+            {
+                const int becpind = ib * tnp + ip2;
+                int m2 = ip_m[ip2];
+                const int index_mm = m1 * tlp1 + m2;
+                ps[psind] += vu_iat[index_mm] * becp[becpind];
+            }
+        }
+    }
+
+    // Verify ps[0] (ib=0, ip=0)
+    // m1=0, iat=0, vu_iat=vu[0..]
+    // ip2 from 0 to 5
+    std::complex<double> expected_ps00 = {0.0, 0.0};
+    for(int ip2 = 0; ip2 < tlp1; ip2++)
+    {
+        const int becpind = 0 * tnp + ip2;
+        int m2 = ip_m[ip2];
+        const int index_mm = 0 * tlp1 + m2;
+        expected_ps00 += vu[index_mm] * becp[becpind];
+    }
+    EXPECT_DOUBLE_EQ(ps[0].real(), expected_ps00.real());
+    EXPECT_DOUBLE_EQ(ps[0].imag(), expected_ps00.imag());
+}
+
+// =====================================================================
+// Step 4: Test spin-up only path (isolate from spin-down)
+// =====================================================================
+
+TEST_F(DftuPwTest, SpinUpOnly_Path_Nspin2)
+{
+    // Test that spin-up calculation is independent and correct
+    const int m_size = 5;
+    const int size = m_size * m_size;
+    const double U_val = 5.0;
+    const double diag_coeff = 0.5;
+
+    // Only set spin-up locale
+    std::vector<double> locale_up(size, 0.0);
+    for(int m = 0; m < m_size; m++)
+        locale_up[m * m_size + m] = 0.8;
+
+    // Calculate VU for spin-up only
+    std::vector<std::complex<double>> vu_up(size, {0.0, 0.0});
+    for(int m1 = 0; m1 < m_size; m1++)
+    {
+        for(int m2 = 0; m2 < m_size; m2++)
+        {
+            vu_up[m1 * m_size + m2] = U_val *
+                (diag_coeff * (m1 == m2) - locale_up[m2 * m_size + m1]);
+        }
+    }
+
+    // Verify diagonal values
+    for(int m = 0; m < m_size; m++)
+        EXPECT_DOUBLE_EQ(vu_up[m * m_size + m].real(), -1.5); // 5*(0.5-0.8)
+
+    // Verify off-diagonal are zero
+    for(int m1 = 0; m1 < m_size; m1++)
+        for(int m2 = 0; m2 < m_size; m2++)
+            if(m1 != m2)
+                EXPECT_DOUBLE_EQ(vu_up[m1 * m_size + m2].real(), 0.0);
+}
+
+// =====================================================================
+// Step 5: Test spin-down only path (isolate from spin-up)
+// =====================================================================
+
+TEST_F(DftuPwTest, SpinDownOnly_Path_Nspin2)
+{
+    // Test that spin-down calculation is independent and correct
+    const int m_size = 5;
+    const int size = m_size * m_size;
+    const double U_val = 5.0;
+    const double diag_coeff = 0.5;
+
+    // Only set spin-down locale
+    std::vector<double> locale_dn(size, 0.0);
+    for(int m = 0; m < m_size; m++)
+        locale_dn[m * m_size + m] = 0.2;
+
+    // Calculate VU for spin-down only
+    std::vector<std::complex<double>> vu_dn(size, {0.0, 0.0});
+    for(int m1 = 0; m1 < m_size; m1++)
+    {
+        for(int m2 = 0; m2 < m_size; m2++)
+        {
+            vu_dn[m1 * m_size + m2] = U_val *
+                (diag_coeff * (m1 == m2) - locale_dn[m2 * m_size + m1]);
+        }
+    }
+
+    // Verify diagonal values
+    for(int m = 0; m < m_size; m++)
+        EXPECT_DOUBLE_EQ(vu_dn[m * m_size + m].real(), 1.5); // 5*(0.5-0.2)
+
+    // Verify off-diagonal are zero
+    for(int m1 = 0; m1 < m_size; m1++)
+        for(int m2 = 0; m2 < m_size; m2++)
+            if(m1 != m2)
+                EXPECT_DOUBLE_EQ(vu_dn[m1 * m_size + m2].real(), 0.0);
+}
+
+// =====================================================================
+// Multi-atom split layout test for nspin=2
+// Verifies that the split layout [all_up | all_dn] works correctly
+// with multiple correlated atoms (the P0-1 bug fix)
+// =====================================================================
+
+TEST_F(DftuPwTest, MultiAtomSplitLayout_Nspin2)
+{
+    // 2 correlated atoms with d-orbital (l=2)
+    const int nat = 2;
+    const int m_size = 5;
+    const int size = m_size * m_size; // 25 per atom per spin
+    const int P = nat * size; // 50 = total spin-up block size
+    const int total = P * 2; // 100 = total array size (split: up|dn)
+
+    // eff_pot_pw_index: split layout, each atom gets `size` entries
+    std::vector<int> eff_pot_pw_index(nat);
+    eff_pot_pw_index[0] = 0;
+    eff_pot_pw_index[1] = size; // 25
+
+    // --- Test uom_array writing (dftu_pw.cpp logic) ---
+    std::vector<double> uom_array(total, 0.0);
+    // Simulate locale values for both atoms
+    std::vector<double> locale_up_0(size, 0.0), locale_dn_0(size, 0.0);
+    std::vector<double> locale_up_1(size, 0.0), locale_dn_1(size, 0.0);
+    for(int m = 0; m < m_size; m++)
+    {
+        locale_up_0[m * m_size + m] = 0.8;
+        locale_dn_0[m * m_size + m] = 0.2;
+        locale_up_1[m * m_size + m] = 0.7;
+        locale_dn_1[m * m_size + m] = 0.3;
+    }
+
+    // Write to uom_array using split layout
+    const int half_size = total / 2; // P = 50
+    // atom 0
+    for(int mm = 0; mm < size; mm++)
+    {
+        uom_array[eff_pot_pw_index[0] + mm] = locale_up_0[mm];
+        uom_array[half_size + eff_pot_pw_index[0] + mm] = locale_dn_0[mm];
+    }
+    // atom 1
+    for(int mm = 0; mm < size; mm++)
+    {
+        uom_array[eff_pot_pw_index[1] + mm] = locale_up_1[mm];
+        uom_array[half_size + eff_pot_pw_index[1] + mm] = locale_dn_1[mm];
+    }
+
+    // Verify split layout: first half = all spin-up, second half = all spin-down
+    // atom 0 up: [0..24]
+    EXPECT_DOUBLE_EQ(uom_array[0], 0.8); // locale_up_0 diagonal
+    // atom 1 up: [25..49]
+    EXPECT_DOUBLE_EQ(uom_array[size + 0], 0.7); // locale_up_1 diagonal
+    // atom 0 dn: [50..74]
+    EXPECT_DOUBLE_EQ(uom_array[half_size + 0], 0.2); // locale_dn_0 diagonal
+    // atom 1 dn: [75..99]
+    EXPECT_DOUBLE_EQ(uom_array[half_size + size + 0], 0.3); // locale_dn_1 diagonal
+
+    // --- Test set_locale reading (dftu_occup.cpp logic) ---
+    std::vector<double> read_up_0(size, 0.0), read_dn_0(size, 0.0);
+    std::vector<double> read_up_1(size, 0.0), read_dn_1(size, 0.0);
+
+    for(int mm = 0; mm < size; mm++)
+    {
+        // atom 0
+        read_up_0[mm] = uom_array[eff_pot_pw_index[0] + mm];
+        read_dn_0[mm] = uom_array[half_size + eff_pot_pw_index[0] + mm];
+        // atom 1
+        read_up_1[mm] = uom_array[eff_pot_pw_index[1] + mm];
+        read_dn_1[mm] = uom_array[half_size + eff_pot_pw_index[1] + mm];
+    }
+
+    for(int mm = 0; mm < size; mm++)
+    {
+        EXPECT_DOUBLE_EQ(read_up_0[mm], locale_up_0[mm]);
+        EXPECT_DOUBLE_EQ(read_dn_0[mm], locale_dn_0[mm]);
+        EXPECT_DOUBLE_EQ(read_up_1[mm], locale_up_1[mm]);
+        EXPECT_DOUBLE_EQ(read_dn_1[mm], locale_dn_1[mm]);
+    }
+
+    // --- Test VU writing (dftu_pw.cpp logic) ---
+    std::vector<std::complex<double>> eff_pot_pw(total, {0.0, 0.0});
+    const double U_val = 5.0;
+    const double diag_coeff = 0.5;
+
+    // atom 0 spin-up VU
+    std::complex<double>* vu_up_0 = &eff_pot_pw[eff_pot_pw_index[0]];
+    for(int m1 = 0; m1 < m_size; m1++)
+        for(int m2 = 0; m2 < m_size; m2++)
+            vu_up_0[m1 * m_size + m2] = U_val * (diag_coeff * (m1 == m2) - locale_up_0[m2 * m_size + m1]);
+
+    // atom 0 spin-down VU (split layout: offset by half_size)
+    std::complex<double>* vu_dn_0 = &eff_pot_pw[eff_pot_pw.size() / 2 + eff_pot_pw_index[0]];
+    for(int m1 = 0; m1 < m_size; m1++)
+        for(int m2 = 0; m2 < m_size; m2++)
+            vu_dn_0[m1 * m_size + m2] = U_val * (diag_coeff * (m1 == m2) - locale_dn_0[m2 * m_size + m1]);
+
+    // atom 1 spin-up VU
+    std::complex<double>* vu_up_1 = &eff_pot_pw[eff_pot_pw_index[1]];
+    for(int m1 = 0; m1 < m_size; m1++)
+        for(int m2 = 0; m2 < m_size; m2++)
+            vu_up_1[m1 * m_size + m2] = U_val * (diag_coeff * (m1 == m2) - locale_up_1[m2 * m_size + m1]);
+
+    // atom 1 spin-down VU
+    std::complex<double>* vu_dn_1 = &eff_pot_pw[eff_pot_pw.size() / 2 + eff_pot_pw_index[1]];
+    for(int m1 = 0; m1 < m_size; m1++)
+        for(int m2 = 0; m2 < m_size; m2++)
+            vu_dn_1[m1 * m_size + m2] = U_val * (diag_coeff * (m1 == m2) - locale_dn_1[m2 * m_size + m1]);
+
+    // Verify VU values
+    // atom 0 up diagonal: 5*(0.5-0.8) = -1.5
+    EXPECT_DOUBLE_EQ(vu_up_0[0].real(), -1.5);
+    // atom 0 dn diagonal: 5*(0.5-0.2) = 1.5
+    EXPECT_DOUBLE_EQ(vu_dn_0[0].real(), 1.5);
+    // atom 1 up diagonal: 5*(0.5-0.7) = -1.0
+    EXPECT_DOUBLE_EQ(vu_up_1[0].real(), -1.0);
+    // atom 1 dn diagonal: 5*(0.5-0.3) = 1.0
+    EXPECT_DOUBLE_EQ(vu_dn_1[0].real(), 1.0);
+
+    // Verify no overlap between atoms in VU arrays
+    // atom 0 up ends at index 24, atom 1 up starts at 25 — no overlap
+    EXPECT_NE(vu_up_0[0], vu_up_1[0]);
+    // atom 0 dn starts at half_size=50, atom 1 dn starts at half_size+25=75 — no overlap
+    EXPECT_NE(vu_dn_0[0], vu_dn_1[0]);
+}
+
+// =====================================================================
+// Test that split layout copy_locale/uom_save is consistent
+// with set_locale/uom_array round-trip for multi-atom nspin=2
+// =====================================================================
+
+TEST_F(DftuPwTest, RoundTripCopyAndSetLocale_Nspin2_MultiAtom)
+{
+    const int nat = 2;
+    const int m_size = 5;
+    const int size = m_size * m_size;
+    const int P = nat * size;
+    const int total = P * 2;
+
+    std::vector<int> eff_pot_pw_index = {0, size};
+    std::vector<double> uom_save(total, 0.0);
+    std::vector<double> uom_array(total, 0.0);
+
+    // Simulate locale values
+    std::vector<std::vector<double>> locale_up(nat, std::vector<double>(size, 0.0));
+    std::vector<std::vector<double>> locale_dn(nat, std::vector<double>(size, 0.0));
+    for(int iat = 0; iat < nat; iat++)
+        for(int m = 0; m < m_size; m++)
+        {
+            locale_up[iat][m * m_size + m] = 0.9 - iat * 0.1;
+            locale_dn[iat][m * m_size + m] = 0.1 + iat * 0.1;
+        }
+
+    // copy_locale -> uom_save (split layout)
+    const int half_size = total / 2;
+    for(int iat = 0; iat < nat; iat++)
+        for(int mm = 0; mm < size; mm++)
+        {
+            uom_save[eff_pot_pw_index[iat] + mm] = locale_up[iat][mm];
+            uom_save[half_size + eff_pot_pw_index[iat] + mm] = locale_dn[iat][mm];
+        }
+
+    // cal_occ_pw -> uom_array (split layout)
+    for(int iat = 0; iat < nat; iat++)
+        for(int mm = 0; mm < size; mm++)
+        {
+            uom_array[eff_pot_pw_index[iat] + mm] = locale_up[iat][mm];
+            uom_array[half_size + eff_pot_pw_index[iat] + mm] = locale_dn[iat][mm];
+        }
+
+    // Mixing would compare uom_array with uom_save — verify they match
+    for(int i = 0; i < total; i++)
+        EXPECT_DOUBLE_EQ(uom_array[i], uom_save[i]);
+
+    // set_locale reads back from uom_array
+    std::vector<std::vector<double>> read_up(nat, std::vector<double>(size, 0.0));
+    std::vector<std::vector<double>> read_dn(nat, std::vector<double>(size, 0.0));
+    for(int iat = 0; iat < nat; iat++)
+        for(int mm = 0; mm < size; mm++)
+        {
+            read_up[iat][mm] = uom_array[eff_pot_pw_index[iat] + mm];
+            read_dn[iat][mm] = uom_array[half_size + eff_pot_pw_index[iat] + mm];
+        }
+
+    // Verify round-trip consistency
+    for(int iat = 0; iat < nat; iat++)
+        for(int mm = 0; mm < size; mm++)
+        {
+            EXPECT_DOUBLE_EQ(read_up[iat][mm], locale_up[iat][mm]);
+            EXPECT_DOUBLE_EQ(read_dn[iat][mm], locale_dn[iat][mm]);
+        }
+}
+
+// =====================================================================
+// get_locale_flat / set_locale_flat logic tests (pure arithmetic)
+//
+// These test the nspin-dependent packing/unpacking logic without
+// requiring a Plus_U instance, by simulating the same operations.
+// =====================================================================
+
+TEST_F(DftuPwTest, LocaleFlatPackNspin1)
+{
+    PARAM.input.nspin = 1;
+    const int tlp1 = 3;
+    const int size = tlp1 * tlp1;
+    std::vector<double> locale_spin0(size);
+    for (int i = 0; i < size; i++) locale_spin0[i] = static_cast<double>(i);
+    std::vector<double> occ(size);
+    for (int i = 0; i < size; i++) occ[i] = locale_spin0[i];
+    for (int i = 0; i < size; i++) EXPECT_DOUBLE_EQ(occ[i], static_cast<double>(i));
+}
+
+TEST_F(DftuPwTest, LocaleFlatPackNspin2)
+{
+    PARAM.input.nspin = 2;
+    const int tlp1 = 3;
+    const int size = tlp1 * tlp1;
+    std::vector<double> locale_spin0(size), locale_spin1(size);
+    for (int i = 0; i < size; i++)
+    {
+        locale_spin0[i] = static_cast<double>(i);
+        locale_spin1[i] = static_cast<double>(i + 100);
+    }
+    std::vector<double> occ(2 * size);
+    for (int i = 0; i < size; i++)
+    {
+        occ[i] = locale_spin0[i];
+        occ[size + i] = locale_spin1[i];
+    }
+    for (int i = 0; i < size; i++)
+    {
+        EXPECT_DOUBLE_EQ(occ[i], static_cast<double>(i));
+        EXPECT_DOUBLE_EQ(occ[size + i], static_cast<double>(i + 100));
+    }
+}
+
+TEST_F(DftuPwTest, LocaleFlatSetRoundTrip)
+{
+    const int tlp1 = 2;
+    const int size = tlp1 * tlp1;
+    std::vector<double> locale_data(size, 0.0);
+    std::vector<double> occ(size);
+    for (int i = 0; i < size; i++) occ[i] = static_cast<double>(i + 50);
+    for (int i = 0; i < size; i++) locale_data[i] = occ[i];
+    for (int i = 0; i < size; i++)
+        EXPECT_DOUBLE_EQ(locale_data[i], static_cast<double>(i + 50));
+}
diff --git a/source/source_lcao/module_operator_lcao/dftu_force_stress.hpp b/source/source_lcao/module_operator_lcao/dftu_force_stress.hpp
index 9b5958e4056..38c96025fa5 100644
--- a/source/source_lcao/module_operator_lcao/dftu_force_stress.hpp
+++ b/source/source_lcao/module_operator_lcao/dftu_force_stress.hpp
@@ -49,7 +49,7 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
         int T0=0;
         int I0=0;
         ucell->iat2iait(iat0, &I0, &T0);
-        if(this->dftu->orbital_corr[T0] == -1)
+        if(!this->dftu->has_correlated_orbital(T0))
         {
             continue;
         }
@@ -71,11 +71,11 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
         int T0=0;
         int I0=0;
         ucell->iat2iait(iat0, &I0, &T0);
-        const int target_L = this->dftu->orbital_corr[T0];
-		if (target_L == -1) 
-		{
-			continue;
-		}
+        if (!this->dftu->has_correlated_orbital(T0))
+        {
+            continue;
+        }
+        const int target_L = this->dftu->get_orbital_corr(T0);
         const int tlp1 = 2 * target_L + 1;
         AdjacentAtomInfo& adjs = this->adjs_all[atom_index_all[iat0]];
 
@@ -139,22 +139,7 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
         }
         // first iteration to calculate occupation matrix
         std::vector<double> occ(tlp1 * tlp1 * this->nspin, 0);
-        if(this->nspin ==2)
-        {
-            for (int i = 0; i < occ.size(); i++)
-            {
-                const int is = i / (tlp1 * tlp1);
-                const int ii = i % (tlp1 * tlp1);
-                occ[i] = this->dftu->locale[iat0][target_L][0][is].c[ii];
-            }
-        }
-        else
-        {
-            for (int i = 0; i < occ.size(); i++)
-            {
-                occ[i] = this->dftu->locale[iat0][target_L][0][0].c[i];
-            }
-        }
+        this->dftu->get_locale_flat(iat0, target_L, occ);
 
         // calculate VU
         const double u_value = this->dftu->U[T0];
diff --git a/source/source_lcao/module_operator_lcao/dftu_lcao.cpp b/source/source_lcao/module_operator_lcao/dftu_lcao.cpp
index 3189f05f13c..e33f4962352 100644
--- a/source/source_lcao/module_operator_lcao/dftu_lcao.cpp
+++ b/source/source_lcao/module_operator_lcao/dftu_lcao.cpp
@@ -55,11 +55,11 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(const Grid_Driver
         int T0=0;
         int I0=0;
         ucell->iat2iait(iat0, &I0, &T0);
-        const int target_L = this->dftu->orbital_corr[T0];
-		if (target_L == -1) 
-		{
-			continue;
-		}
+        if (!this->dftu->has_correlated_orbital(T0))
+        {
+            continue;
+        }
+        const int target_L = this->dftu->get_orbital_corr(T0);
 
         AdjacentAtomInfo adjs;
         GridD->Find_atom(*ucell, tau0, T0, I0, &adjs);
@@ -107,12 +107,12 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_nlm_all(const Parallel_Orbi
         int T0=0;
         int I0=0;
         ucell->iat2iait(iat0, &I0, &T0);
-        const int target_L = this->dftu->orbital_corr[T0];
-		if (target_L == -1) 
-		{
-			continue;
-		}
-		const int tlp1 = 2 * target_L + 1;
+        if (!this->dftu->has_correlated_orbital(T0))
+        {
+            continue;
+        }
+        const int target_L = this->dftu->get_orbital_corr(T0);
+        const int tlp1 = 2 * target_L + 1;
         AdjacentAtomInfo& adjs = this->adjs_all[atom_index++];
 
         // calculate and save the table of two-center integrals
@@ -173,17 +173,75 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_nlm_all(const Parallel_Orbi
 }
 
 // contributeHR()
+/**
+ * @brief Contribute DFT+U Hamiltonian to real-space HR matrix
+ * 
+ * @details This function handles different scenarios based on:
+ * 1. Whether locale (occupation matrix) is read from file (is_locale_initialized)
+ * 2. Spin configuration (nspin=1, 2, or 4)
+ * 3. SCF iteration stage (first vs subsequent iterations)
+ * 
+ * Case 1: Locale NOT initialized (!is_locale_initialized)
+ *   - First electronic iteration: calculates occupation matrix from density matrix (DMR)
+ *     * Uses get_dmr(current_spin) to get real-space density matrix
+ *     * Accumulates contributions from all atom pairs via cal_occ()
+ *     * Performs MPI reduction to sum occ across processes
+ *     * Stores result via set_locale_flat() for use in VU calculation
+ *     * For nspin=1: occ is scaled by 0.5 (since only one spin channel computed)
+ *   - Subsequent iterations: locale is computed fresh each iteration from updated DMR
+ * 
+ * Case 2: Locale IS initialized (is_locale_initialized, i.e., read from onsite.dm file)
+ *   - First electronic iteration: uses pre-read locale directly without DMR calculation
+ *     * Skips DMR-based occ calculation entirely
+ *     * Reads locale from stored data via get_locale()
+ *     * Different indexing for nspin=4 vs nspin=1/2 (see below)
+ *   - After first iteration: mark_locale_dirty() is called to force recomputation
+ * 
+ * Spin configurations:
+ *   nspin=1 (non-spin-polarized):
+ *     - Single spin channel, occ computed once
+ *     - Energy correction doubled at end (set_double_energy)
+ *     - current_spin always 0
+ *   
+ *   nspin=2 (collinear spin-polarized):
+ *     - Two separate spin channels (spin-up: 0, spin-down: 1)
+ *     - current_spin toggles between 0 and 1 across iterations
+ *     - mark_locale_dirty() called when current_spin == 1 (last spin)
+ *     - HR accumulated separately for each spin
+ *   
+ *   nspin=4 (non-collinear/SOC):
+ *     - Single 4x4 Pauli matrix representation per atom
+ *     - occ has 4*(2l+1)^2 elements (spin_fold=4)
+ *     - get_locale uses spin=0, ipol indices for Pauli blocks
+ *     - mark_locale_dirty() always called (current_spin check always true)
+ *     - No current_spin toggling (all spins handled simultaneously)
+ * 
+ * @warning THREAD SAFETY: cal_HR_IJR() updates shared HR matrix entries.
+ *          Different iat0 may contribute to same HR(iat1, iat2, R), requiring
+ *          critical section protection for multithreaded correctness.
+ *          TODO: Consider refactoring to atom_row_list pattern (see nonlocal.cpp)
+ *          for better parallel performance instead of critical section.
+ */
 template <typename TK, typename TR>
 void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
 {
     ModuleBase::TITLE("DFTU", "contributeHR");
-    if (this->dftu->get_dmr(0) == nullptr && this->dftu->initialed_locale == false)
-    { // skip the calculation if dm_in_dftu is nullptr
+    // Early exit conditions:
+    // - get_dmr(0) == nullptr: DMR not available (typical in first iteration without file input)
+    // - !is_locale_initialized(): locale not read from file AND not yet computed from DMR
+    // When both true, skip DFT+U contribution entirely (first iteration, no file input)
+    const bool dmr_null = (this->dftu->get_dmr(0) == nullptr);
+    const bool locale_not_init = !this->dftu->is_locale_initialized();
+
+    if (dmr_null && locale_not_init)
+    {
         return;
     }
     else
     {
-        // will update this->dftu->locale and this->dftu->EU
+        // Reset DFT+U energy at start of each spin cycle
+        // For nspin=2: reset when current_spin==0 (start of spin-up calculation)
+        // For nspin=1/4: reset once (current_spin always 0)
 		if (this->current_spin == 0) 
 		{
             this->dftu->set_energy(0.0);
@@ -193,30 +251,43 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
 
     const Parallel_Orbitals* paraV = this->hR->get_atom_pair(0).get_paraV();
     const int npol = this->ucell->get_npol();
-    // 1. calculate <psi|alpha> for each pair of atoms
+    // 1. Calculate <psi|alpha> two-center integrals for all atom pairs
+    //    This is reused in both occ and HR calculations
     this->cal_nlm_all(paraV);
-    // loop over all on-site atoms
+
+    // 2. Loop over all Hubbard-projector center atoms (iat0)
     int atom_index = 0;
     for (int iat0 = 0; iat0 < this->ucell->nat; iat0++)
     {
-        // skip the atoms without plus-U
         auto tau0 = ucell->get_tau(iat0);
         int T0, I0;
         ucell->iat2iait(iat0, &I0, &T0);
-        const int target_L = this->dftu->orbital_corr[T0];
-		if (target_L == -1) 
-		{
-			continue;
-		}
+        if (!this->dftu->has_correlated_orbital(T0))
+        {
+            continue;
+        }
+        const int target_L = this->dftu->get_orbital_corr(T0);
         const int tlp1 = 2 * target_L + 1;
         AdjacentAtomInfo& adjs = this->adjs_all[atom_index++];
 
         ModuleBase::timer::start("DFTU", "cal_occ");
-        // first iteration to calculate occupation matrix
+        // spin_fold: number of spin components in occ array
+        // nspin=4: 4 (Pauli matrix blocks), nspin=1/2: 1 (single spin channel)
         const int spin_fold = (this->nspin == 4) ? 4 : 1;
         std::vector<double> occ(tlp1 * tlp1 * spin_fold, 0.0);
-        if (this->dftu->initialed_locale == false)
+        
+        // ============================================================
+        // BRANCH 1: Locale NOT initialized (compute from DMR)
+        // ============================================================
+        // This branch is taken when:
+        // - is_locale_initialized() == false (no file read or omc != 0)
+        // - DMR is available (get_dmr() != nullptr)
+        // Typical scenario: normal SCF iterations after first step
+        if (!this->dftu->is_locale_initialized())
         {
+            // TODO: UNSAFE - get_dmr(current_spin) assumes DMR has correct spin indexing.
+            // For nspin=2, current_spin must be correctly toggled (0 then 1).
+            // If current_spin is wrong, wrong spin channel's DMR is used.
             const hamilt::HContainer<double>* dmR_current = this->dftu->get_dmr(this->current_spin);
             for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
             {
@@ -237,7 +308,6 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
                                                       R_index2[2] - R_index1[2]);
                     const hamilt::BaseMatrix<double>* tmp
                         = dmR_current->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]);
-                    // if not found , skip this pair of atoms
                     if (tmp != nullptr)
                     {
                         this->cal_occ(iat1, iat2, paraV, nlm1, nlm2, tmp->get_pointer(), occ);
@@ -245,46 +315,100 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
                 }
             }
 #ifdef __MPI
-            // sum up the occupation matrix
+            // CRITICAL: MPI reduction required for distributed DMR calculations.
+            // Each process computes partial occ from its local DMR blocks.
+            // Without this, occ would be incomplete and DFT+U potential wrong.
+            // TODO: Verify that occ size is consistent across processes before reduction.
+            // TODO: Consider using MPI_IN_PLACE to avoid extra buffer allocation.
             Parallel_Reduce::reduce_all(occ.data(), occ.size());
 #endif
-            // save occ to dftu
-            for (int i = 0; i < occ.size(); i++)
+            // For nspin=1: occ computed from single spin channel, but should represent
+            // total occupation (both spins). Scale by 0.5 to account for this.
+            if (this->nspin == 1)
             {
-				if (this->nspin == 1) 
-				{
-					occ[i] *= 0.5;
-				}
-                this->dftu->locale[iat0][target_L][0][this->current_spin].c[i] = occ[i];
+                for (auto& v : occ) { v *= 0.5; }
             }
+            this->dftu->set_locale_flat(iat0, target_L, this->current_spin, occ);
         }
-        else // use readin locale to calculate occupation matrix
+        // ============================================================
+        // BRANCH 2: Locale IS initialized (use pre-read data)
+        // ============================================================
+        // This branch is taken when:
+        // - is_locale_initialized() == true (locale read from onsite.dm file)
+        // - OR omc != 0 (occupation matrix control with initial_onsite.dm)
+        // Typical scenario: first SCF iteration with file input, or restart calculation
+        else
         {
-            for (int i = 0; i < occ.size(); i++)
+            // nspin=4: Non-collinear case with Pauli matrix representation
+            // Locale stored as single 4x4 block per atom, with spin indices embedded
+            // in the matrix indices (ipol0, ipol1 for Pauli block indices)
+            if (this->nspin == 4)
+            {
+                const int tlp1_local = 2 * target_L + 1;
+                const int m_size2_local = tlp1_local * tlp1_local;
+                for (int i = 0; i < static_cast<int>(occ.size()); i++)
+                {
+                    // Decode flattened index to (Pauli_block, m, m') format
+                    const int ib = i / m_size2_local;          // Pauli block index (0-3)
+                    const int m = (i % m_size2_local) / tlp1_local;  // m quantum number
+                    const int m2_val = (i % m_size2_local) % tlp1_local; // m' quantum number
+                    const int ipol0 = ib / npol;               // Row Pauli index
+                    const int ipol1 = ib % npol;               // Column Pauli index
+                    const int m0_all = m + ipol0 * tlp1_local; // Combined row index
+                    const int m1_all = m2_val + ipol1 * tlp1_local; // Combined col index
+                    // TODO: UNSAFE - get_locale indices must match storage format exactly.
+                    // Mismatch in indexing between set_locale_flat and get_locale causes silent corruption.
+                    // TODO: Add bounds checking for m0_all, m1_all against locale array dimensions.
+                    occ[i] = this->dftu->get_locale(iat0, target_L, 0, 0, m0_all, m1_all);
+                }
+            }
+            // nspin=1 or nspin=2: Collinear spin case
+            // Locale stored separately for each spin channel
+            else
             {
-                occ[i] = this->dftu->locale[iat0][target_L][0][this->current_spin].c[i];
+                for (int i = 0; i < static_cast<int>(occ.size()); i++)
+                {
+                    // TODO: UNSAFE - current_spin must be correct for nspin=2.
+                    // If current_spin is not toggled properly, wrong spin channel's locale is read.
+                    // This can happen if contributeHR() is called out of expected order.
+                    occ[i] = this->dftu->get_locale(iat0, target_L, 0, this->current_spin,
+                                                      i / (2 * target_L + 1), i % (2 * target_L + 1));
+                }
             }
-            // set initialed_locale to false to avoid using readin locale in next iteration
         }
         ModuleBase::timer::end("DFTU", "cal_occ");
 
-        // calculate VU
+        // 3. Calculate Hubbard potential VU from occupation matrix
+        // VU = U * (1/2 * delta(m,m') - occ(m,m')) for each spin channel
+        // Energy: EU = U * 1/2 * occ(m,m') * occ(m',m)
         ModuleBase::timer::start("DFTU", "cal_vu");
         const double u_value = this->dftu->U[T0];
         std::vector<double> VU_tmp(occ.size());
 
-        // mohan add 2025-11-08
+        // TODO: GLOBAL STATE - Plus_U::get_energy()/set_energy() uses static member variable.
+        // This is NOT thread-safe for parallel SCF calculations.
+        // TODO: Refactor to use instance member or pass energy by reference.
         double u_energy = Plus_U::get_energy();
         this->cal_v_of_u(occ, tlp1, u_value, VU_tmp.data(), u_energy);
         Plus_U::set_energy(u_energy);
 
-        // transfer occ from pauli matrix format to normal format
+        // 4. Convert VU to appropriate data type (real or complex)
+        // For nspin=4 with complex Hamiltonian, VU needs Pauli matrix transformation
         std::vector<TR> VU(occ.size());
         this->transfer_vu(VU_tmp, VU);
 
-        // second iteration to calculate Hamiltonian matrix
-        // calculate <psi_I|beta_m> U*(1/2*delta(m, m')-occ(m, m')) <beta_m'|psi_{J,R}> for each pair of <IJR> atoms
-        // 2. calculate <psi_I|beta>D<beta|psi_{J,R}> for each pair of <IJR> atoms
+        // 5. Second iteration: Calculate Hamiltonian matrix contribution
+        // HR += <psi_I|beta_m> * VU(m,m') * <beta_m'|psi_{J,R}>
+        // for all atom pairs <I,J,R> within cutoff
+        // Note: different iat0 may contribute to the same HR(iat1, iat2, R), so we need to protect the update
+        // to avoid race conditions in multithreading. Reference: nonlocal.cpp for the atom_row_list pattern.
+        // TODO: CRITICAL SECTION PERFORMANCE - This critical section serializes HR updates.
+        // For systems with many Hubbard atoms, this becomes a bottleneck.
+        // Consider refactoring to atom_row_list pattern (see nonlocal.cpp lines 127-220):
+        //   1. Use #pragma omp for to distribute iat0 across threads
+        //   2. Each thread records its assigned iat0 in thread-local atom_row_list
+        //   3. When updating HR(iat1, iat2, R), skip if iat1 not in thread's atom_row_list
+        //   4. This eliminates race conditions without critical section
         for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
         {
             const int T1 = adjs.ntype[ad1];
@@ -303,30 +427,51 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
                                                   R_index2[1] - R_index1[1],
                                                   R_index2[2] - R_index1[2]);
                 hamilt::BaseMatrix<TR>* tmp = this->hR->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]);
-                // if not found , skip this pair of atoms
                 if (tmp != nullptr)
                 {
-                    this->cal_HR_IJR(iat1, iat2, paraV, nlm1, nlm2, VU, tmp->get_pointer());
+#ifdef _OPENMP
+#pragma omp critical(dftu_hr_update)
+#endif
+                    {
+                        this->cal_HR_IJR(iat1, iat2, paraV, nlm1, nlm2, VU, tmp->get_pointer());
+                    }
                 }
             }
         }
         ModuleBase::timer::end("DFTU", "cal_vu");
     }
 
-    // energy correction for NSPIN=1
+    // 6. Post-processing: Energy correction and locale state management
+    // For nspin=1: DFT+U energy computed for single spin channel, but should count both spins
+    // set_double_energy() doubles the energy to account for degenerate spin-up/down
 	if (this->nspin == 1) 
 	{
         this->dftu->set_double_energy();
 	}
-	// for readin onsite_dm, set initialed_locale to false to avoid using readin locale in next iteration
+	
+    // 7. Mark locale as dirty to force recomputation in next iteration
+    // This is called when:
+    // - nspin=4: Always (all spins handled simultaneously, current_spin==0==nspin-1)
+    // - nspin=2: When current_spin==1 (after spin-down calculation, last spin channel)
+    // - nspin=1: When current_spin==0==nspin-1 (always called)
+    // 
+    // Purpose: Ensure locale is recomputed from updated DMR in next SCF iteration,
+    // rather than using stale pre-read data from file.
+    // TODO: This logic is confusing. Consider explicit variable like `is_last_spin_channel`.
 	if (this->current_spin == this->nspin - 1 || this->nspin == 4) 
 	{
-		this->dftu->initialed_locale = false;
+		this->dftu->mark_locale_dirty();
 	}
 
-    // update this->current_spin: only nspin=2 iterate change it between 0 and 1
-    // the key point is only nspin=2 calculate spin-up and spin-down separately,
-    // and won't calculate spin-up twice without spin-down
+    // 8. Spin channel toggling for nspin=2
+    // nspin=2 requires separate HR updates for spin-up (current_spin=0) and spin-down (current_spin=1)
+    // The HR matrix is updated twice per SCF iteration, once for each spin channel
+    // current_spin toggles: 0 -> 1 -> 0 -> 1 ...
+    // For nspin=1: current_spin always 0 (no toggling needed)
+    // For nspin=4: current_spin always 0 (all spins handled simultaneously via Pauli matrices)
+    // TODO: UNSAFE - This assumes contributeHR() is called in strict alternating order.
+    // If called out of order (e.g., due to parallel k-point distribution), current_spin may be wrong.
+    // TODO: Consider deriving current_spin from ik or explicit parameter instead of toggling.
 	if (this->nspin == 2) 
 	{
 		this->current_spin = 1 - this->current_spin;
diff --git a/source/source_lcao/module_operator_lcao/dspin_lcao.cpp b/source/source_lcao/module_operator_lcao/dspin_lcao.cpp
index 7954ae8ab22..996e76843a2 100644
--- a/source/source_lcao/module_operator_lcao/dspin_lcao.cpp
+++ b/source/source_lcao/module_operator_lcao/dspin_lcao.cpp
@@ -29,6 +29,8 @@ hamilt::DeltaSpin<hamilt::OperatorLCAO<TK, TR>>::DeltaSpin(HS_Matrix_K<TK>* hsk_
 
     this->lambda_save.resize(this->ucell->nat * 3, 0.0);
     this->update_lambda_.resize(this->nspin, false);
+    this->B_I_data.resize(this->ucell->nat);
+    this->B_I_nproj.resize(this->ucell->nat, 0);
 }
 
 // destructor
@@ -67,16 +69,17 @@ void hamilt::DeltaSpin<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
     // if lambda has not changed, calculate the HR^I = lambda^I\sum_{lm}<phi_mu|alpha^I_{lm}><alpha^I_{lm}|phi_{nu,R}>
     // if lambda has changed, calculate the dHR^I = dlambda^I\sum_{lm}<phi_mu|alpha^I_{lm}><alpha^I_{lm}|phi_{nu,R}> 
     spinconstrain::SpinConstrain<TK>& sc = spinconstrain::SpinConstrain<TK>::getScInstance();
-    // there are three case for contributeHR 
-    // 1. HR has not been calculated, reset lambda_save and calculate HR = lambda * pre_hr
-    // 2. HR has been calculated, but lambda has changed, calculate dHR = dlambda * pre_hr
-    // 3. HR has been calculated, and lambda has not changed, do nothing
+    // there are three case for contributeHR
+    // 1. HR is being rebuilt from scratch (hr_done=false): reset lambda_save and add full lambda
+    // 2. HR exists but lambda has changed (hr_done=true, sc_hr_done=false or update_lambda_=true):
+    //    compute incremental delta = lambda - lambda_save and add to existing HR
+    // 3. HR exists and lambda has not changed: do nothing
     if(!this->hr_done)
     {
-        // set the lambda_save to zero if lambda loop is started
+        // HR is being rebuilt from scratch, so the old DS contribution is gone
         this->lambda_save.assign(this->ucell->nat * 3, 0.0);
     }
-    else if(this->hr_done && !this->update_lambda_[this->current_spin])
+    else if(this->sc_hr_done && !this->update_lambda_[this->current_spin])
     {
         return;
     }
@@ -166,6 +169,7 @@ void hamilt::DeltaSpin<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
             }
         }
     }
+    this->sc_hr_done = true;
     return;
 }
 
@@ -346,6 +350,18 @@ void hamilt::DeltaSpin<hamilt::OperatorLCAO<TK, TR>>::cal_pre_HR()
             }
         }
 
+        // Save B_I overlap data for subspace projection optimization
+        this->B_I_data[iat].clear();
+        this->B_I_nproj[iat] = max_l_plus_1 * max_l_plus_1;
+        for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+        {
+            BI_AdjacentData bi_ad;
+            bi_ad.iat_adj = this->ucell->itia2iat(adjs.ntype[ad], adjs.natom[ad]);
+            bi_ad.R_index = adjs.box[ad];
+            bi_ad.nlm = nlm_iat0[ad];
+            this->B_I_data[iat].push_back(std::move(bi_ad));
+        }
+
         // fourth step: calculate the <phi|alpha><alpha|phi>
         for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
         {
@@ -525,6 +541,89 @@ void hamilt::DeltaSpin<hamilt::OperatorLCAO<std::complex<double>, std::complex<d
     moment[2] += tmp_moment[2].real();
 }
 
+// cal_PI_sub: compute P_I_sub(k) = D_I(k)^dag D_I(k) for all constrained atoms
+// D_I(k) = B_I(k) * C_k, where B_I(k)[lm, mu] = sum_R <alpha_I_lm|phi_{mu,R}> exp(ik·R)
+// C_k is the 2D-block distributed wavefunction matrix
+template <typename TK, typename TR>
+void hamilt::DeltaSpin<hamilt::OperatorLCAO<TK, TR>>::cal_PI_sub(
+    const ModuleBase::Vector3<double>& kvec_d,
+    const std::complex<double>* psi_k,
+    const int nbands_global,
+    std::vector<std::vector<std::complex<double>>>& PI_sub) const
+{
+    const int nat = this->ucell->nat;
+    PI_sub.resize(nat);
+
+    const int nrow_local = this->paraV->get_row_size();   // local rows of C_k
+    const int ncol_local = this->paraV->ncol_bands;        // local band columns of C_k
+    const int lda = nrow_local;  // leading dimension (column-major for ScaLAPACK)
+
+    for (int iat = 0; iat < nat; iat++)
+    {
+        if (!this->constraint_atom_list[iat])
+        {
+            PI_sub[iat].clear();
+            continue;
+        }
+
+        const int r = this->B_I_nproj[iat];
+        // D_I_local: r × nbands_global, initialized to zero
+        // We accumulate local contributions, then MPI_Allreduce
+        std::vector<std::complex<double>> D_I(r * nbands_global, {0.0, 0.0});
+
+        for (const auto& bi_ad : this->B_I_data[iat])
+        {
+            // Phase factor: exp(i * 2pi * k · R)
+            const double arg = 2.0 * M_PI * (kvec_d.x * bi_ad.R_index.x
+                                            + kvec_d.y * bi_ad.R_index.y
+                                            + kvec_d.z * bi_ad.R_index.z);
+            const std::complex<double> phase(cos(arg), sin(arg));
+
+            for (const auto& [iw_global, nlm_vec] : bi_ad.nlm)
+            {
+                // Check if this global orbital index is in our local rows
+                const int iw_local = this->paraV->global2local_row(iw_global);
+                if (iw_local < 0) { continue;
+                }
+
+                // D_I[lm, jb_global] += nlm_vec[lm] * phase * C_k[iw_local, jb_local]
+                // C_k is column-major: C_k[irow, icol] = psi_k[irow + icol * lda]
+                for (int jb_local = 0; jb_local < ncol_local; jb_local++)
+                {
+                    const int jb_global = this->paraV->local2global_col(jb_local);
+                    const std::complex<double> c_val = phase * psi_k[iw_local + jb_local * lda];
+                    for (int lm = 0; lm < r; lm++)
+                    {
+                        D_I[lm * nbands_global + jb_global] += nlm_vec[lm] * c_val;
+                    }
+                }
+            }
+        }
+
+        // MPI_Allreduce to sum D_I across all processes
+#ifdef __MPI
+        MPI_Allreduce(MPI_IN_PLACE, D_I.data(), 2 * r * nbands_global,
+                      MPI_DOUBLE, MPI_SUM, this->paraV->comm());
+#endif
+
+        // Compute P_I_sub = D_I^dag D_I (nbands × nbands Hermitian matrix)
+        // Using zgemm: C = alpha * A^H * B + beta * C
+        // A = D_I (r × nbands), B = D_I (r × nbands)
+        // C = P_I_sub (nbands × nbands)
+        PI_sub[iat].resize(nbands_global * nbands_global, {0.0, 0.0});
+        const std::complex<double> one = {1.0, 0.0};
+        const std::complex<double> zero_c = {0.0, 0.0};
+        // zgemm: P = D^H * D, where D is r × nbands (row-major: D[lm][jb])
+        // In column-major (Fortran) convention for BLAS:
+        // D stored as nbands_global × r (transposed view)
+        // We want P = D^H * D = (r×nb)^H * (r×nb) = nb×nb
+        zgemm_("C", "N", &nbands_global, &nbands_global, &r,
+               &one, D_I.data(), &r,
+               D_I.data(), &r,
+               &zero_c, PI_sub[iat].data(), &nbands_global);
+    }
+}
+
 #include "dspin_force_stress.hpp"
 
 template class hamilt::DeltaSpin<hamilt::OperatorLCAO<double, double>>;
diff --git a/source/source_lcao/module_operator_lcao/dspin_lcao.h b/source/source_lcao/module_operator_lcao/dspin_lcao.h
index 291d2b87d9f..b4ea23510e6 100644
--- a/source/source_lcao/module_operator_lcao/dspin_lcao.h
+++ b/source/source_lcao/module_operator_lcao/dspin_lcao.h
@@ -8,6 +8,7 @@
 #include "source_lcao/module_operator_lcao/operator_lcao.h"
 #include "source_lcao/module_hcontainer/hcontainer.h"
 #include <unordered_map>
+#include <complex>
 
 namespace hamilt
 {
@@ -48,6 +49,12 @@ class DeltaSpin<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
     */
     std::vector<double> cal_moment(const HContainer<double>* dmR, const std::vector<ModuleBase::Vector3<int>>& constrain);
 
+    /// @brief Reset initialization state to allow re-constraint with new constrain array
+    void reset_initialized()
+    {
+        this->initialized = false;
+    }
+
     /**
      * @brief set the update_lambda_ to true, which means the lambda will be updated in the next contributeHR()
     */
@@ -57,6 +64,24 @@ class DeltaSpin<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
         {
             this->update_lambda_[is] = true;
         }
+        // Reset sc_hr_done so contributeHR() recalculates DeltaSpin HR
+        // in the next k-point loop (avoids accumulation across k-points)
+        this->sc_hr_done = false;
+    }
+
+    /**
+     * @brief Shadow set_current_spin to reset sc_hr_done on spin switch (nspin=2).
+     * In the lambda loop, refresh_times=0 so the shared hr_done is NOT reset on
+     * spin switch. sc_hr_done must be reset here so each spin's HR is computed
+     * independently.
+     */
+    void set_current_spin(const int current_spin_in)
+    {
+        if (this->current_spin != current_spin_in)
+        {
+            this->sc_hr_done = false;
+        }
+        OperatorLCAO<TK, TR>::set_current_spin(current_spin_in);
     }
 
     /// calculate force and stress for DFT+U
@@ -66,6 +91,18 @@ class DeltaSpin<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
                           ModuleBase::matrix& force,
                           ModuleBase::matrix& stress);
 
+    /// @brief Compute P_I_sub(k) = D_I(k)^dag D_I(k) for all constrained atoms
+    /// Uses saved B_I overlaps and 2D-block distributed wavefunctions
+    /// @param kvec_d  k-point in direct coordinates (for phase factor)
+    /// @param psi_k   wavefunction coefficients C_k (2D-block distributed)
+    /// @param nbands_global  global number of bands
+    /// @param PI_sub  output: PI_sub[iat] is nbands×nbands Hermitian matrix (gathered to all procs)
+    ///                Only filled for constrained atoms; empty for unconstrained.
+    void cal_PI_sub(const ModuleBase::Vector3<double>& kvec_d,
+                    const std::complex<double>* psi_k,
+                    const int nbands_global,
+                    std::vector<std::vector<std::complex<double>>>& PI_sub) const;
+
   private:
     const UnitCell* ucell = nullptr;
 
@@ -154,6 +191,19 @@ class DeltaSpin<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
     bool initialized = false;
     int spin_num = 1;
     std::vector<bool> update_lambda_;
+    /// Independent HR completion flag for DeltaSpin, decoupled from
+    /// the shared OperatorLCAO::hr_done to avoid cross-k-point accumulation.
+    bool sc_hr_done = false;
+
+    /// @brief Saved B_I overlap data for subspace projection optimization
+    /// For each constrained atom I, stores the overlaps <phi_mu|alpha_I_lm> organized by adjacent atoms
+    struct BI_AdjacentData {
+        int iat_adj;                                          ///< global atom index of adjacent atom
+        ModuleBase::Vector3<int> R_index;                     ///< cell index of adjacent atom
+        std::unordered_map<int, std::vector<double>> nlm;     ///< iw_global -> <phi_iw|alpha_I_lm>
+    };
+    std::vector<std::vector<BI_AdjacentData>> B_I_data;       ///< [iat][adj_index]
+    std::vector<int> B_I_nproj;                               ///< r = max_l_plus_1^2 per constrained atom
 };
 
 }
diff --git a/source/source_lcao/module_operator_lcao/operator_lcao.cpp b/source/source_lcao/module_operator_lcao/operator_lcao.cpp
index 1e4a5f728bd..0d815e78f4d 100644
--- a/source/source_lcao/module_operator_lcao/operator_lcao.cpp
+++ b/source/source_lcao/module_operator_lcao/operator_lcao.cpp
@@ -186,9 +186,9 @@ void OperatorLCAO<TK, TR>::init(const int ik_in) {
         case calculation_type::lcao_sc_lambda:
         {
             //update HR first
+            // Only contribute once per SCF iteration (when hr_done=false)
+            // or when lambda has changed (checked inside contributeHR)
             this->contributeHR();
-            //in cal_type=lcao_sc_mag, 
-            //this->contributeHk(ik_in);
             break;
         }
         case calculation_type::lcao_exx:
@@ -264,6 +264,7 @@ void OperatorLCAO<double, double>::contributeHk(int ik) {
         const int ncol = this->hsk->get_pv()->get_col_size();
         hamilt::folding_HR(*this->hR, this->hsk->get_hk(), this->kvec_d[ik], ncol, 0);
     }
+
     ModuleBase::timer::end("OperatorLCAO", "contributeHk");
 }
 // contributeHk()
@@ -295,6 +296,7 @@ void OperatorLCAO<TK, TR>::contributeHk(int ik) {
             hamilt::folding_HR(*this->hR, this->hsk->get_hk(), this->kvec_d[ik], ncol, 0);
         }
     }
+
     ModuleBase::timer::end("OperatorLCAO", "contributeHk");
 }
 
diff --git a/source/source_lcao/module_operator_lcao/test/CMakeLists.txt b/source/source_lcao/module_operator_lcao/test/CMakeLists.txt
index a1c52935cf1..304cc92e327 100644
--- a/source/source_lcao/module_operator_lcao/test/CMakeLists.txt
+++ b/source/source_lcao/module_operator_lcao/test/CMakeLists.txt
@@ -82,10 +82,10 @@ AddTest(
 
 AddTest(
   TARGET MODULE_LCAO_operator_dftu_test
-  LIBS parameter ${math_libs} psi base device container 
-  SOURCES test_dftu.cpp ../dftu_lcao.cpp ../../module_hcontainer/func_folding.cpp 
-  ../../module_hcontainer/base_matrix.cpp ../../module_hcontainer/hcontainer.cpp ../../module_hcontainer/atom_pair.cpp  
-  ../../../source_basis/module_ao/parallel_orbitals.cpp 
+  LIBS parameter ${math_libs} psi base device container
+  SOURCES test_dftu.cpp ../dftu_lcao.cpp ../../module_hcontainer/func_folding.cpp
+  ../../module_hcontainer/base_matrix.cpp ../../module_hcontainer/hcontainer.cpp ../../module_hcontainer/atom_pair.cpp
+  ../../../source_basis/module_ao/parallel_orbitals.cpp
   ../../../source_basis/module_ao/ORB_atomic_lm.cpp
   tmp_mocks.cpp ../../../source_hamilt/operator.cpp
 )
diff --git a/source/source_lcao/module_operator_lcao/test/test_dftu.cpp b/source/source_lcao/module_operator_lcao/test/test_dftu.cpp
index 31adb426ad4..20723a11e6a 100644
--- a/source/source_lcao/module_operator_lcao/test/test_dftu.cpp
+++ b/source/source_lcao/module_operator_lcao/test/test_dftu.cpp
@@ -23,6 +23,28 @@ const hamilt::HContainer<double>* Plus_U::get_dmr(int ispin) const
     return tmp_DMR;
 }
 
+void Plus_U::get_locale_flat(const int iat, const int l, std::vector<double>& occ) const
+{
+    const int tlp1 = 2 * l + 1;
+    const int tlp1_2 = tlp1 * tlp1;
+    occ.resize(tlp1_2);
+    for (int i = 0; i < tlp1_2; i++)
+    {
+        occ[i] = locale[iat][l][0][0].c[i];
+    }
+}
+
+void Plus_U::set_locale_flat(const int iat, const int l, const int spin,
+                              const std::vector<double>& occ)
+{
+    const int tlp1 = 2 * l + 1;
+    const int tlp1_2 = tlp1 * tlp1;
+    for (int i = 0; i < tlp1_2 && i < static_cast<int>(occ.size()); i++)
+    {
+        locale[iat][l][0][spin].c[i] = occ[i];
+    }
+}
+
 //---------------------------------------
 // Unit test of Plus_U class
 // Plus_U is a derivative class of Operator, it is used to calculate the kinetic matrix
diff --git a/source/source_lcao/module_ri/RPA_LRI.hpp b/source/source_lcao/module_ri/RPA_LRI.hpp
index 17695b572ee..2c144a6509a 100644
--- a/source/source_lcao/module_ri/RPA_LRI.hpp
+++ b/source/source_lcao/module_ri/RPA_LRI.hpp
@@ -1271,7 +1271,7 @@ void RPA_LRI<T, Tdata>::out_coulomb_k(const UnitCell& ucell,
 // 			list_As_Vs.first, list_As_Vs.second[0],
 // 			{{"writable_Vws",true}});
 
-// 	// Vs[iat0][{iat1,cell1}]	按 (iat0,iat1) 分进程，每个进程有所有 cell1
+// 	// Vs[iat0][{iat1,cell1}]	distributed by (iat0,iat1), each process has all cell1
 // 	Vqs = FFT(Vs);
 // 	out_Vs(Vqs);
 
diff --git a/source/source_lcao/module_rt/solve_propagation.cpp b/source/source_lcao/module_rt/solve_propagation.cpp
index aa0a9f38371..540d16c3292 100644
--- a/source/source_lcao/module_rt/solve_propagation.cpp
+++ b/source/source_lcao/module_rt/solve_propagation.cpp
@@ -83,7 +83,7 @@ void solve_propagation(const Parallel_Orbitals* pv,
                        const double dt,
                        const std::complex<double>* Stmp,
                        const std::complex<double>* Htmp,
-                       const std::complex<double>* P_k, // <--- 接收 P_k
+                        const std::complex<double>* P_k, // <--- receives P_k
                        const std::complex<double>* psi_k_laststep,
                        std::complex<double>* psi_k)
 {
diff --git a/source/source_main/driver.cpp b/source/source_main/driver.cpp
index c22e4d08fba..2c16cac42ae 100644
--- a/source/source_main/driver.cpp
+++ b/source/source_main/driver.cpp
@@ -28,7 +28,6 @@ void Driver::init()
 
     // 2) Print the current time, since it may run a long time.
     time_t time_start = std::time(nullptr);
-    ModuleBase::timer::start();
 
     // 3) Welcome to the atomic world! Let's do some fancy stuff here.
     this->atomic_world();
diff --git a/source/source_main/main.cpp b/source/source_main/main.cpp
index 3f55e6e7b5e..58ca5b383c9 100644
--- a/source/source_main/main.cpp
+++ b/source/source_main/main.cpp
@@ -4,11 +4,11 @@
 //==========================================================
 
 #include "source_main/driver.h"
-#include "fftw3.h"
 #include "source_base/parallel_global.h"
 #include "source_io/parse_args.h"
 #include "source_io/module_parameter/parameter.h"
 #ifdef _OPENMP
+#include <fftw3.h>
 #include <omp.h>
 #endif
 
@@ -43,14 +43,16 @@ int main(int argc, char** argv)
     DD.init();
 
     /*
-    After running mpi version of abacus, release the mpi resources.
+    Clean up FFTW threads before MPI_Finalize to avoid OpenMPI 4.0.3
+    hwloc segfault: FFTW must release its hwloc resources before MPI
+    finalizes and frees the shared hwloc topology.
     */
-#ifdef __MPI
-    Parallel_Global::finalize_mpi();
-#endif
 #ifdef _OPENMP
     fftw_cleanup_threads();
 #endif
+#ifdef __MPI
+    Parallel_Global::finalize_mpi();
+#endif
 
     return 0;
-}
\ No newline at end of file
+}
diff --git a/source/source_pw/module_pwdft/deltaspin_pw.cpp b/source/source_pw/module_pwdft/deltaspin_pw.cpp
index 680ec26afc2..34bd5156148 100644
--- a/source/source_pw/module_pwdft/deltaspin_pw.cpp
+++ b/source/source_pw/module_pwdft/deltaspin_pw.cpp
@@ -20,20 +20,29 @@ bool run_deltaspin_lambda_loop(const int iter,
     spinconstrain::SpinConstrain<std::complex<double>>& sc
         = spinconstrain::SpinConstrain<std::complex<double>>::getScInstance();
 
+    /// Case 0: linear_scan strategy - sweep lambda values for energy landscape mapping
+    /// This is a diagnostic/debugging mode that does NOT optimize lambda,
+    /// only records Mi vs lambda to lambda_scan_results.dat.
+    if (inp.sc_lambda_strategy == "linear_scan")
+    {
+        sc.run_lambda_linear_scan(iter);
+        return true;
+    }
+
     /// Case 1: Magnetic moments not yet converged and SCF is close to convergence.
     /// This is the first time we enter the lambda loop after SCF is nearly converged.
     if (!sc.mag_converged() && drho > 0 && drho < inp.sc_scf_thr)
     {
         /// Optimize lambda to get target magnetic moments
-        sc.run_lambda_loop(iter);
+        sc.run_lambda_loop(iter - 1);
         sc.set_mag_converged(true);
         return true;
     }
     /// Case 2: Magnetic moments already converged in previous iteration.
-    /// Continue to refine lambda in subsequent SCF iterations.
+    /// Re-run the lambda loop to update psi and charge density with current lambda.
     else if (sc.mag_converged())
     {
-        sc.run_lambda_loop(iter);
+        sc.run_lambda_loop(iter - 1);
         return true;
     }
 
diff --git a/source/source_pw/module_pwdft/dftu_pw.cpp b/source/source_pw/module_pwdft/dftu_pw.cpp
index 7d74449bfb3..97cb8d5ccce 100644
--- a/source/source_pw/module_pwdft/dftu_pw.cpp
+++ b/source/source_pw/module_pwdft/dftu_pw.cpp
@@ -6,14 +6,15 @@ namespace pw
 {
 
 void iter_init_dftu_pw(const int iter,
-                       const int istep,
-                       Plus_U& dftu,
-                       const void* psi,
-                       const ModuleBase::matrix& wg,
-                       const UnitCell& ucell,
-                       const Input_para& inp)
+                        const int istep,
+                        Plus_U& dftu,
+                        const void* psi,
+                        const ModuleBase::matrix& wg,
+                        const UnitCell& ucell,
+                        Charge_Mixing* p_chgmix,
+                        const int* isk)
 {
-    if (!inp.dft_plus_u)
+    if (!p_chgmix || !PARAM.inp.dft_plus_u)
     {
         return;
     }
@@ -25,7 +26,7 @@ void iter_init_dftu_pw(const int iter,
 
     if (dftu.omc != 2)
     {
-        dftu.cal_occ_pw(iter, psi, wg, ucell, inp.mixing_beta);
+        dftu.cal_occ_pw(iter, psi, wg, ucell, p_chgmix, isk);
     }
     dftu.output(ucell);
 }
diff --git a/source/source_pw/module_pwdft/dftu_pw.h b/source/source_pw/module_pwdft/dftu_pw.h
index db67834a188..94e24f31ff9 100644
--- a/source/source_pw/module_pwdft/dftu_pw.h
+++ b/source/source_pw/module_pwdft/dftu_pw.h
@@ -3,6 +3,7 @@
 
 #include "source_cell/unitcell.h"
 #include "source_base/matrix.h"
+#include "source_estate/module_charge/charge_mixing.h"
 
 struct Input_para;
 class Plus_U;
@@ -16,7 +17,8 @@ void iter_init_dftu_pw(const int iter,
                        const void* psi,
                        const ModuleBase::matrix& wg,
                        const UnitCell& ucell,
-                       const Input_para& inp);
+                       Charge_Mixing* p_chgmix,
+                       const int* isk);
 
 }
 
diff --git a/source/source_pw/module_pwdft/forces.cpp b/source/source_pw/module_pwdft/forces.cpp
index 2f2e1ea3a86..da2587a3dd8 100644
--- a/source/source_pw/module_pwdft/forces.cpp
+++ b/source/source_pw/module_pwdft/forces.cpp
@@ -52,6 +52,7 @@ void Forces<FPTYPE, Device>::cal_force(UnitCell& ucell,
     ModuleBase::matrix forcecc(nat, 3);
     ModuleBase::matrix forcenl(nat, 3);
     ModuleBase::matrix forcescc(nat, 3);
+    ModuleBase::matrix forcepaw(nat, 3);
     ModuleBase::matrix forceonsite(nat, 3);
 
     // Force due to local ionic potential
diff --git a/source/source_pw/module_pwdft/forces_onsite.cpp b/source/source_pw/module_pwdft/forces_onsite.cpp
index 2429a015515..255d33f9433 100644
--- a/source/source_pw/module_pwdft/forces_onsite.cpp
+++ b/source/source_pw/module_pwdft/forces_onsite.cpp
@@ -13,7 +13,7 @@ void Forces<FPTYPE, Device>::cal_force_onsite(ModuleBase::matrix& force_onsite,
                                           const ModuleBase::matrix& wg,
                                           const ModulePW::PW_Basis_K* wfc_basis,
 										  const UnitCell& ucell_in,
-										  const Plus_U &dftu, // mohan add 2025-11-06
+										  const Plus_U &dftu,
 										  const psi::Psi <std::complex<FPTYPE>, Device>* psi_in)
 {
     ModuleBase::TITLE("Forces", "cal_force_onsite");
@@ -23,7 +23,6 @@ void Forces<FPTYPE, Device>::cal_force_onsite(ModuleBase::matrix& force_onsite,
     }
     ModuleBase::timer::start("Forces", "cal_force_onsite");
 
-    // allocate memory for the force
     FPTYPE* force = nullptr;
     resmem_var_op()(force, ucell_in.nat * 3);
     base_device::memory::set_memory_op<FPTYPE, Device>()(force, 0.0, ucell_in.nat * 3);
@@ -31,9 +30,8 @@ void Forces<FPTYPE, Device>::cal_force_onsite(ModuleBase::matrix& force_onsite,
     auto* onsite_p = projectors::OnsiteProjector<FPTYPE, Device>::get_instance();
 
     const int nks = wfc_basis->nks;
-    for (int ik = 0; ik < nks; ik++) // loop k points
+    for (int ik = 0; ik < nks; ik++)
     {
-        // skip zero weights to speed up
         int nbands_occ = wg.nc;
         while (wg(ik, nbands_occ - 1) == 0.0)
         {
@@ -45,32 +43,25 @@ void Forces<FPTYPE, Device>::cal_force_onsite(ModuleBase::matrix& force_onsite,
         }
         const int npm = nbands_occ;
         onsite_p->get_fs_tools()->cal_becp(ik, npm);
-        // calculate becp = <psi|beta> for all beta functions
         for (int ipol = 0; ipol < 3; ipol++)
         {
-            // calculate dbecp = <psi|\nabla beta> for all beta functions
             onsite_p->get_fs_tools()->cal_dbecp_f(ik, npm, ipol);
         }
-        // calculate the force_i = \sum_{n,k}f_{nk}\sum_I \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_i
-        // force for DFT+U
         if(PARAM.inp.dft_plus_u)
         {
-            onsite_p->get_fs_tools()->cal_force_dftu(ik, npm, force, 
-              dftu.orbital_corr.data(), dftu.get_eff_pot_pw(0), dftu.get_size_eff_pot_pw(), wg.c);
+            onsite_p->cal_force_onsite_dftu(ik, npm, force, dftu, nks, wg.c);
         }
         if(PARAM.inp.sc_mag_switch)
         {
             spinconstrain::SpinConstrain<std::complex<double>>& sc = 
               spinconstrain::SpinConstrain<std::complex<double>>::getScInstance();
-            const std::vector<ModuleBase::Vector3<double>>& lambda = sc.get_sc_lambda();
-            onsite_p->get_fs_tools()->cal_force_dspin(ik, npm, force, lambda.data(), wg.c);
+            onsite_p->cal_force_onsite_dspin(ik, npm, force, sc.get_sc_lambda().data(), wg.c);
         }
         
-    } // end ik
+    }
 
     syncmem_var_d2h_op()(force_onsite.c, force, force_onsite.nr * force_onsite.nc);
     delmem_var_op()(force);
-    // sum up force_onsite from all processors
     Parallel_Reduce::reduce_all(force_onsite.c, force_onsite.nr * force_onsite.nc);
 
     ModuleBase::timer::end("Forces", "cal_force_onsite");
diff --git a/source/source_pw/module_pwdft/kernels/cuda/force_op.cu b/source/source_pw/module_pwdft/kernels/cuda/force_op.cu
index 1466ba47acc..91d263b78c0 100644
--- a/source/source_pw/module_pwdft/kernels/cuda/force_op.cu
+++ b/source/source_pw/module_pwdft/kernels/cuda/force_op.cu
@@ -454,6 +454,7 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
                                                                   const int& nbands,
                                                                   const int& ik,
                                                                   const int& nkb,
+                                                                  const int& npol,
                                                                   const int* atom_nh,
                                                                   const int* atom_na,
                                                                   const FPTYPE& tpiba,
@@ -493,6 +494,7 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
                                                                   const int& nbands,
                                                                   const int& ik,
                                                                   const int& nkb,
+                                                                  const int& npol,
                                                                   const int* atom_nh,
                                                                   const int* atom_na,
                                                                   const FPTYPE& tpiba,
diff --git a/source/source_pw/module_pwdft/kernels/cuda/onsite_op.cu b/source/source_pw/module_pwdft/kernels/cuda/onsite_op.cu
index 68aee02047d..654cd257a8e 100644
--- a/source/source_pw/module_pwdft/kernels/cuda/onsite_op.cu
+++ b/source/source_pw/module_pwdft/kernels/cuda/onsite_op.cu
@@ -21,14 +21,26 @@ __global__ void onsite_op(const int npm,
 {
     const int ip = blockIdx.x;
     const int nbands = npm / npol;
-    for (int ib = threadIdx.x; ib < nbands; ib += blockDim.x)
+    int iat = ip_iat[ip];
+    if (npol == 2)
     {
-        int ib2 = ib * npol;
-        int iat = ip_iat[ip];
-        const int psind = ip * npm + ib2;
-        const int becpind = ib2 * tnp + ip;
-        ps[psind] += lambda_coeff[iat * 4] * becp[becpind] + lambda_coeff[iat * 4 + 2] * becp[becpind + tnp];
-        ps[psind + 1] += lambda_coeff[iat * 4 + 1] * becp[becpind] + lambda_coeff[iat * 4 + 3] * becp[becpind + tnp];
+        for (int ib = threadIdx.x; ib < nbands; ib += blockDim.x)
+        {
+            int ib2 = ib * npol;
+            const int psind = ip * npm + ib2;
+            const int becpind = ib2 * tnp + ip;
+            ps[psind] += lambda_coeff[iat * 4] * becp[becpind] + lambda_coeff[iat * 4 + 2] * becp[becpind + tnp];
+            ps[psind + 1] += lambda_coeff[iat * 4 + 1] * becp[becpind] + lambda_coeff[iat * 4 + 3] * becp[becpind + tnp];
+        }
+    }
+    else
+    {
+        for (int ib = threadIdx.x; ib < nbands; ib += blockDim.x)
+        {
+            const int psind = ip * npm + ib;
+            const int becpind = ib * tnp + ip;
+            ps[psind] += lambda_coeff[iat] * becp[becpind];
+        }
     }
 }
 
diff --git a/source/source_pw/module_pwdft/kernels/cuda/stress_op.cu b/source/source_pw/module_pwdft/kernels/cuda/stress_op.cu
index 58a8e219e5c..09ce05fb6ad 100644
--- a/source/source_pw/module_pwdft/kernels/cuda/stress_op.cu
+++ b/source/source_pw/module_pwdft/kernels/cuda/stress_op.cu
@@ -1051,6 +1051,7 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
                     const int& ntype,
                     const int& wg_nc,
                     const int& ik,
+                    const int& npol,
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE* d_wg,
@@ -1084,6 +1085,7 @@ void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
                     const int& ntype,
                     const int& wg_nc,
                     const int& ik,
+                    const int& npol,
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE* d_wg,
diff --git a/source/source_pw/module_pwdft/kernels/force_op.cpp b/source/source_pw/module_pwdft/kernels/force_op.cpp
index 0e0c34ccdde..10435cddf98 100644
--- a/source/source_pw/module_pwdft/kernels/force_op.cpp
+++ b/source/source_pw/module_pwdft/kernels/force_op.cpp
@@ -292,6 +292,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     const int& nbands,
                     const int& ik,
                     const int& nkb,
+                    const int& npol,
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE& tpiba,
@@ -321,7 +322,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
             {
                 for (int ib = 0; ib < nbands_occ; ib++)
                 {
-                    const int ib2 = ib*2;
+                    const int ib2 = ib*npol;
                     FPTYPE local_force[3] = {0, 0, 0};
                     FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
                     int iat = iat0 + ia;
@@ -330,36 +331,47 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     {
                         const int inkb = sum + ip;
                         const int m = ip - ip_begin;
-                        // out<<"\n ps = "<<ps;
                         for (int ip2 = ip_begin; ip2 < ip_end; ip2++)
                         {
                             const int jnkb = sum + ip2;
                             const int m2 = ip2 - ip_begin;
-                            std::complex<FPTYPE> ps[4];
-                            for(int i = 0; i < 4; i++)
+                            if(npol == 2)
                             {
-                                ps[i] = vu[(i * tlp1_2 + m * tlp1 + m2)];
-                            }
+                                std::complex<FPTYPE> ps[4];
+                                for(int i = 0; i < 4; i++)
+                                {
+                                    ps[i] = vu[(i * tlp1_2 + m * tlp1 + m2)];
+                                }
 
-                            for (int ipol = 0; ipol < 3; ipol++)
-                            {
-                                const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
-                                const int index1 = ib2 * nkb + jnkb;
-                                const std::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
-                                const std::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
-                                const std::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
-                                const std::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
+                                for (int iforce = 0; iforce < 3; iforce++)
+                                {
+                                    const int index0 = iforce * nbands * npol * nkb + ib2 * nkb + inkb;
+                                    const int index1 = ib2 * nkb + jnkb;
+                                    const std::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
+                                    const std::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
+                                    const std::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
+                                    const std::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
 
-                                local_force[ipol] -= fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real();
+                                    local_force[iforce] -= fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real();
+                                }
+                            }
+                            else if(npol == 1)
+                            {
+                                for (int iforce = 0; iforce < 3; iforce++)
+                                {
+                                    const int index0 = iforce * nbands * npol * nkb + ib2 * nkb + inkb;
+                                    const int index1 = ib2 * nkb + jnkb;
+                                    local_force[iforce] -= fac * (vu[(m * tlp1 + m2)] * conj(dbecp[index0]) * becp[index1]).real();
+                                }
                             }
                         }
                     }
-                    for (int ipol = 0; ipol < 3; ++ipol)
+                    for (int iforce = 0; iforce < 3; ++iforce)
                     {
-                        force[iat * forcenl_nc + ipol] += local_force[ipol];
+                        force[iat * forcenl_nc + iforce] += local_force[iforce];
                     }
                 }
-                vu += 4 * tlp1_2;// step for vu
+                vu += npol * npol * tlp1_2;// step for vu
             } // end ia
             iat0 += atom_na[it];
             sum0 += atom_na[it] * nproj;
@@ -374,6 +386,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     const int& nbands,
                     const int& ik,
                     const int& nkb,
+                    const int& npol,
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE& tpiba,
@@ -398,25 +411,43 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
                 const std::complex<FPTYPE> coefficients3(-1 * lambda[iat*3+2], 0.0);
                 for (int ib = 0; ib < nbands_occ; ib++)
                 {
-                    const int ib2 = ib*2;
                     FPTYPE local_force[3] = {0, 0, 0};
                     FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
-                    for (int ip = 0; ip < nproj; ip++)
+                    if (npol == 2)
                     {
-                        const int inkb = sum + ip;
+                        const int ib2 = ib * 2;
+                        for (int ip = 0; ip < nproj; ip++)
+                        {
+                            const int inkb = sum + ip;
 
-                        for (int ipol = 0; ipol < 3; ipol++)
+                            for (int ipol = 0; ipol < 3; ipol++)
+                            {
+                                const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
+                                const int index1 = ib2 * nkb + inkb;
+                                const std::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
+                                const std::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
+                                const std::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
+                                const std::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
+
+                                local_force[ipol] -= fac * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3).real();
+                            }
+                        } // ip
+                    }
+                    else if (npol == 1)
+                    {
+                        for (int ip = 0; ip < nproj; ip++)
                         {
-                            const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
-                            const int index1 = ib2 * nkb + inkb;
-                            const std::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
-                            const std::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
-                            const std::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
-                            const std::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
+                            const int inkb = sum + ip;
 
-                            local_force[ipol] -= fac * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3).real();
-                        }
-                    }//ip
+                            for (int ipol = 0; ipol < 3; ipol++)
+                            {
+                                const int index0 = ipol * nbands * nkb + ib * nkb + inkb;
+                                const int index1 = ib * nkb + inkb;
+                                const FPTYPE dbb = (conj(dbecp[index0]) * becp[index1]).real();
+                                local_force[ipol] -= fac * lambda[iat*3+2] * dbb;
+                            }
+                        } // ip
+                    }
                     for (int ipol = 0; ipol < 3; ++ipol)
                     {
                         force[iat * forcenl_nc + ipol] += local_force[ipol];
diff --git a/source/source_pw/module_pwdft/kernels/force_op.h b/source/source_pw/module_pwdft/kernels/force_op.h
index 0e1d51666bf..67c2e85f625 100644
--- a/source/source_pw/module_pwdft/kernels/force_op.h
+++ b/source/source_pw/module_pwdft/kernels/force_op.h
@@ -1,7 +1,8 @@
 #ifndef W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_source_pw_HAMILT_PWDFT_KERNELS_FORCE_OP_H
 #define W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_source_pw_HAMILT_PWDFT_KERNELS_FORCE_OP_H
+#include "source_io/module_parameter/parameter.h"
 
-#include "source_base/module_device/types.h"
+#include "source_psi/psi.h"
 
 #include <complex>
 
@@ -120,6 +121,7 @@ struct cal_force_nl_op
                     const int& nbands,
                     const int& ik,
                     const int& nkb,
+                    const int& npol,
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE& tpiba,
@@ -138,6 +140,7 @@ struct cal_force_nl_op
                     const int& nbands,
                     const int& ik,
                     const int& nkb,
+                    const int& npol,
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE& tpiba,
@@ -249,6 +252,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const int& nbands,
                     const int& ik,
                     const int& nkb,
+                    const int& npol,
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE& tpiba,
@@ -267,6 +271,7 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const int& nbands,
                     const int& ik,
                     const int& nkb,
+                    const int& npol,
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE& tpiba,
diff --git a/source/source_pw/module_pwdft/kernels/onsite_op.cpp b/source/source_pw/module_pwdft/kernels/onsite_op.cpp
index c9d7d14432c..8ac4e8fb846 100644
--- a/source/source_pw/module_pwdft/kernels/onsite_op.cpp
+++ b/source/source_pw/module_pwdft/kernels/onsite_op.cpp
@@ -16,23 +16,42 @@ struct onsite_ps_op<FPTYPE, base_device::DEVICE_CPU>
                     std::complex<FPTYPE>* ps,
                     const std::complex<FPTYPE>* becp)
     {
+        if(npol == 2)
+        {
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2)
 #endif
-        for (int ib = 0; ib < npm / npol; ib++)
+            for (int ib = 0; ib < npm / npol; ib++)
+            {
+                for (int ip = 0; ip < tnp; ip++)
+                {
+                    int ib2 = ib * npol;
+                    int iat = ip_iat[ip];
+                    const int psind = ip * npm + ib2;
+                    const int becpind = ib2 * tnp + ip;
+                    ps[psind] += lambda_array[iat * 4] * becp[becpind]
+                                + lambda_array[iat * 4 + 2] * becp[becpind + tnp];
+                    ps[psind + 1] += lambda_array[iat * 4 + 1] * becp[becpind]
+                                + lambda_array[iat * 4 + 3] * becp[becpind + tnp];
+                } // end ip
+            } // end ib
+        }
+        else // npol == 1, nspin=1 or nspin=2
         {
-            for (int ip = 0; ip < tnp; ip++)
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2)
+#endif
+            for (int ib = 0; ib < npm; ib++)
             {
-                int ib2 = ib * npol;
-                int iat = ip_iat[ip];
-                const int psind = ip * npm + ib2;
-                const int becpind = ib2 * tnp + ip;
-                ps[psind] += lambda_array[iat * 4] * becp[becpind] 
-                            + lambda_array[iat * 4 + 2] * becp[becpind + tnp];
-                ps[psind + 1] += lambda_array[iat * 4 + 1] * becp[becpind] 
-                            + lambda_array[iat * 4 + 3] * becp[becpind + tnp];
-            } // end ip
-        } // end ib
+                for (int ip = 0; ip < tnp; ip++)
+                {
+                    int iat = ip_iat[ip];
+                    const int psind = ip * npm + ib;
+                    const int becpind = ib * tnp + ip;
+                    ps[psind] += lambda_array[iat] * becp[becpind];
+                } // end ip
+            } // end ib
+        }
     };
 
     // kernel for DFT+U calculation
@@ -48,6 +67,8 @@ struct onsite_ps_op<FPTYPE, base_device::DEVICE_CPU>
       std::complex<FPTYPE>* ps,
       const std::complex<FPTYPE>* becp)
   {
+    if(npol == 2)
+    {
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2)
 #endif
@@ -78,6 +99,35 @@ struct onsite_ps_op<FPTYPE, base_device::DEVICE_CPU>
                 }
             } // end ip
         } // end ib
+    }
+    else // npol == 1, nspin=1 or nspin=2
+    {
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2)
+#endif
+        for (int ib = 0; ib < npm; ib++)
+        {
+            for (int ip = 0; ip < tnp; ip++)
+            {
+                int m1 = ip_m[ip];
+                if(m1 < 0) continue;
+                int iat = ip_iat[ip];
+                const std::complex<FPTYPE>* vu_iat = vu + vu_begin_iat[iat];
+                int orb_l = orb_l_iat[iat];
+                int tlp1 = 2 * orb_l + 1;
+                int ip2_begin = ip - m1;
+                int ip2_end = ip - m1 + tlp1;
+                const int psind = ip * npm + ib;
+                for(int ip2 = ip2_begin;ip2<ip2_end;ip2++)
+                {
+                    const int becpind = ib * tnp + ip2;
+                    int m2 = ip_m[ip2];
+                    const int index_mm = m1 * tlp1 + m2;
+                    ps[psind] += vu_iat[index_mm] * becp[becpind];
+                }
+            } // end ip
+        } // end ib
+    }
   }
 };
 
diff --git a/source/source_pw/module_pwdft/kernels/rocm/onsite_op.hip.cu b/source/source_pw/module_pwdft/kernels/rocm/onsite_op.hip.cu
index 0826368deac..83e26e309f4 100644
--- a/source/source_pw/module_pwdft/kernels/rocm/onsite_op.hip.cu
+++ b/source/source_pw/module_pwdft/kernels/rocm/onsite_op.hip.cu
@@ -21,14 +21,26 @@ __global__ void onsite_op(const int npm,
 {
     const int ip = blockIdx.x;
     const int nbands = npm / npol;
-    for (int ib = threadIdx.x; ib < nbands; ib += blockDim.x)
+    int iat = ip_iat[ip];
+    if (npol == 2)
     {
-        int ib2 = ib * npol;
-        int iat = ip_iat[ip];
-        const int psind = ip * npm + ib2;
-        const int becpind = ib2 * tnp + ip;
-        ps[psind] += lambda_coeff[iat * 4] * becp[becpind] + lambda_coeff[iat * 4 + 2] * becp[becpind + tnp];
-        ps[psind + 1] += lambda_coeff[iat * 4 + 1] * becp[becpind] + lambda_coeff[iat * 4 + 3] * becp[becpind + tnp];
+        for (int ib = threadIdx.x; ib < nbands; ib += blockDim.x)
+        {
+            int ib2 = ib * npol;
+            const int psind = ip * npm + ib2;
+            const int becpind = ib2 * tnp + ip;
+            ps[psind] += lambda_coeff[iat * 4] * becp[becpind] + lambda_coeff[iat * 4 + 2] * becp[becpind + tnp];
+            ps[psind + 1] += lambda_coeff[iat * 4 + 1] * becp[becpind] + lambda_coeff[iat * 4 + 3] * becp[becpind + tnp];
+        }
+    }
+    else
+    {
+        for (int ib = threadIdx.x; ib < nbands; ib += blockDim.x)
+        {
+            const int psind = ip * npm + ib;
+            const int becpind = ib * tnp + ip;
+            ps[psind] += lambda_coeff[iat] * becp[becpind];
+        }
     }
 }
 
diff --git a/source/source_pw/module_pwdft/kernels/stress_op.cpp b/source/source_pw/module_pwdft/kernels/stress_op.cpp
index 169b9c932c3..d7b4334cfc5 100644
--- a/source/source_pw/module_pwdft/kernels/stress_op.cpp
+++ b/source/source_pw/module_pwdft/kernels/stress_op.cpp
@@ -252,6 +252,7 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     const int& ntype,
                     const int& wg_nc,
                     const int& ik,
+                    const int& npol,
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE* d_wg,
@@ -263,7 +264,7 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
     {
 //	std::cout << " DFT+U kernel called " << std::endl;
         FPTYPE local_stress = 0;
-        int iat = 0, sum = 0;
+        int sum = 0;
         for (int it = 0; it < ntype; it++)
         {
             const int orbital_l = orbital_corr[it];
@@ -281,35 +282,53 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
             {
                 for (int ib = 0; ib < nbands_occ; ib++)
                 {
-                    const int ib2 = ib*2;
+                    const int ib2 = ib*npol;
                     FPTYPE fac = d_wg[ik * wg_nc + ib];
-                    for (int ip1 = ip_begin; ip1 < ip_end; ip1++)
+                    switch (npol)
                     {
-                        const int m1 = ip1 - ip_begin;
-                        const int inkb1 = ib2 * nkb + sum + ia * nproj + ip1;
-                        // out<<"\n ps = "<<ps;
-                        for (int ip2 = ip_begin; ip2 < ip_end; ip2++)
+                    case 1:
+                        for (int ip1 = ip_begin; ip1 < ip_end; ip1++)
                         {
-                            const int m2 = ip2 - ip_begin;
-                            std::complex<FPTYPE> ps[4];
-                            for(int i = 0; i < 4; i++)
+                            const int m1 = ip1 - ip_begin;
+                            const int inkb1 = ib2 * nkb + sum + ia * nproj + ip1;
+                            for (int ip2 = ip_begin; ip2 < ip_end; ip2++)
                             {
-                                ps[i] = vu[(i * tlp1_2 + m1 * tlp1 + m2)];
+                                const int m2 = ip2 - ip_begin;
+                                const int inkb2 = ib2 * nkb + sum + ia * nproj + ip2;
+                                local_stress -= fac * (vu[m1 * tlp1 + m2] * (conj(dbecp[inkb1]) * becp[inkb2])).real();
                             }
-                            const int inkb2 = ib2 * nkb + sum + ia * nproj + ip2;
-
-                            const std::complex<FPTYPE> dbb0 = conj(dbecp[inkb1]) * becp[inkb2];
-                            const std::complex<FPTYPE> dbb1 = conj(dbecp[inkb1]) * becp[nkb + inkb2];
-                            const std::complex<FPTYPE> dbb2 = conj(dbecp[nkb + inkb1]) * becp[inkb2];
-                            const std::complex<FPTYPE> dbb3 = conj(dbecp[nkb + inkb1]) * becp[nkb + inkb2];
-                            local_stress -= fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real();
-                        }
-                    } // end ip
+                        } // end ip
+                        break;
+                    case 2:
+                        for (int ip1 = ip_begin; ip1 < ip_end; ip1++)
+                        {
+                            const int m1 = ip1 - ip_begin;
+                            const int inkb1 = ib2 * nkb + sum + ia * nproj + ip1;
+                            for (int ip2 = ip_begin; ip2 < ip_end; ip2++)
+                            {
+                                const int m2 = ip2 - ip_begin;
+                                std::complex<FPTYPE> ps[4];
+                                for(int i = 0; i < 4; i++)
+                                {
+                                    ps[i] = vu[(i * tlp1_2 + m1 * tlp1 + m2)];
+                                }
+                                const int inkb2 = ib2 * nkb + sum + ia * nproj + ip2;
+
+                                const std::complex<FPTYPE> dbb0 = conj(dbecp[inkb1]) * becp[inkb2];
+                                const std::complex<FPTYPE> dbb1 = conj(dbecp[inkb1]) * becp[nkb + inkb2];
+                                const std::complex<FPTYPE> dbb2 = conj(dbecp[nkb + inkb1]) * becp[inkb2];
+                                const std::complex<FPTYPE> dbb3 = conj(dbecp[nkb + inkb1]) * becp[nkb + inkb2];
+                                local_stress -= fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real();
+                            }
+                        } // end ip
+                        break;
+                    default:
+                        break;
+                    }
                 }// ib
-                vu += 4 * tlp1_2;// step for vu
+                vu += npol * npol * tlp1_2;// step for vu
             }// ia
             sum += atom_na[it] * nproj;
-            iat += atom_na[it];
         } // end it
         *stress += local_stress;
     };
@@ -320,6 +339,7 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
                     const int& ntype,
                     const int& wg_nc,
                     const int& ik,
+                    const int& npol,
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE* d_wg,
@@ -336,25 +356,43 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
             for (int ia = 0; ia < atom_na[it]; ia++)
             {
                 int iat = iat0 + ia;
-                const std::complex<FPTYPE> coefficients0(lambda[iat*3+2], 0.0);
-                const std::complex<FPTYPE> coefficients1(lambda[iat*3] , lambda[iat*3+1]);
-                const std::complex<FPTYPE> coefficients2(lambda[iat*3] , -1 * lambda[iat*3+1]);
-                const std::complex<FPTYPE> coefficients3(-1 * lambda[iat*3+2], 0.0);
-                for (int ib = 0; ib < nbands_occ; ib++)
+                if (npol == 2)
                 {
-                    const int ib2 = ib*2;
-                    FPTYPE fac = d_wg[ik * wg_nc + ib];
-                    for (int ip = 0; ip < nproj; ip++)
+                    const std::complex<FPTYPE> coefficients0(lambda[iat*3+2], 0.0);
+                    const std::complex<FPTYPE> coefficients1(lambda[iat*3] , lambda[iat*3+1]);
+                    const std::complex<FPTYPE> coefficients2(lambda[iat*3] , -1 * lambda[iat*3+1]);
+                    const std::complex<FPTYPE> coefficients3(-1 * lambda[iat*3+2], 0.0);
+                    for (int ib = 0; ib < nbands_occ; ib++)
                     {
-                        const int inkb1 = ib2 * nkb + sum + ia * nproj + ip;
-
-                        const std::complex<FPTYPE> dbb0 = conj(dbecp[inkb1]) * becp[inkb1];
-                        const std::complex<FPTYPE> dbb1 = conj(dbecp[inkb1]) * becp[nkb + inkb1];
-                        const std::complex<FPTYPE> dbb2 = conj(dbecp[nkb + inkb1]) * becp[inkb1];
-                        const std::complex<FPTYPE> dbb3 = conj(dbecp[nkb + inkb1]) * becp[nkb + inkb1];
-                        local_stress -= fac * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3).real();
-                    } // end ip
-                }// ib
+                        const int ib2 = ib * 2;
+                        FPTYPE fac = d_wg[ik * wg_nc + ib];
+                        for (int ip = 0; ip < nproj; ip++)
+                        {
+                            const int inkb1 = ib2 * nkb + sum + ia * nproj + ip;
+
+                            const std::complex<FPTYPE> dbb0 = conj(dbecp[inkb1]) * becp[inkb1];
+                            const std::complex<FPTYPE> dbb1 = conj(dbecp[inkb1]) * becp[nkb + inkb1];
+                            const std::complex<FPTYPE> dbb2 = conj(dbecp[nkb + inkb1]) * becp[inkb1];
+                            const std::complex<FPTYPE> dbb3 = conj(dbecp[nkb + inkb1]) * becp[nkb + inkb1];
+                            local_stress -= fac * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3).real();
+                        } // end ip
+                    } // ib
+                }
+                else if (npol == 1)
+                {
+                    const FPTYPE coefficients0(lambda[iat*3+2]);
+                    for (int ib = 0; ib < nbands_occ; ib++)
+                    {
+                        FPTYPE fac = d_wg[ik * wg_nc + ib];
+                        for (int ip = 0; ip < nproj; ip++)
+                        {
+                            const int inkb = ib * nkb + sum + ia * nproj + ip;
+
+                            const FPTYPE dbb = (conj(dbecp[inkb]) * becp[inkb]).real();
+                            local_stress -= fac * coefficients0 * dbb;
+                        } // end ip
+                    } // ib
+                }
             }// ia
             sum += atom_na[it] * nproj;
             iat0 += atom_na[it];
diff --git a/source/source_pw/module_pwdft/kernels/stress_op.h b/source/source_pw/module_pwdft/kernels/stress_op.h
index 995557ffa05..b5d60e42a9c 100644
--- a/source/source_pw/module_pwdft/kernels/stress_op.h
+++ b/source/source_pw/module_pwdft/kernels/stress_op.h
@@ -1,7 +1,7 @@
 #ifndef SRC_PW_STRESS_MULTI_DEVICE_H
 #define SRC_PW_STRESS_MULTI_DEVICE_H
+#include "source_io/module_parameter/parameter.h"
 
-#include "source_base/module_device/types.h"
 #include "source_psi/psi.h"
 
 #include <complex>
@@ -129,6 +129,7 @@ struct cal_stress_nl_op
                     const int& ntype,
                     const int& wg_nc,
                     const int& ik,
+                    const int& npol,
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE* d_wg,
@@ -144,6 +145,7 @@ struct cal_stress_nl_op
                     const int& ntype,
                     const int& wg_nc,
                     const int& ik,
+                    const int& npol,
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE* d_wg,
@@ -334,6 +336,7 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const int& ntype,
                     const int& wg_nc,
                     const int& ik,
+                    const int& npol,
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE* d_wg,
@@ -349,6 +352,7 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const int& ntype,
                     const int& wg_nc,
                     const int& ik,
+                    const int& npol,
                     const int* atom_nh,
                     const int* atom_na,
                     const FPTYPE* d_wg,
diff --git a/source/source_pw/module_pwdft/onsite_proj.cpp b/source/source_pw/module_pwdft/onsite_proj.cpp
index f9bd19dee03..779d0d101d0 100644
--- a/source/source_pw/module_pwdft/onsite_proj.cpp
+++ b/source/source_pw/module_pwdft/onsite_proj.cpp
@@ -6,6 +6,9 @@
 #include <tuple>
 #include "source_pw/module_pwdft/onsite_proj.h"
 #include "source_pw/module_pwdft/onsite_proj_print.h"
+#include "source_lcao/module_dftu/dftu.h"
+#include "source_lcao/module_deltaspin/spin_constrain.h"
+#include "source_io/module_parameter/parameter.h"
 
 #include "source_base/projgen.h"
 #include "source_base/kernels/math_kernel_op.h"
@@ -111,6 +114,7 @@ void projectors::OnsiteProjector<T, Device>::init(const std::string& orbital_dir
     {
         this->ucell = ucell_in;
         this->ntype = ucell_in->ntype;
+        this->isk_ = kv.isk.data();
 
         this->pw_basis_ = &pw_basis;
         this->sf_ = &sf;
@@ -287,6 +291,7 @@ void projectors::OnsiteProjector<T, Device>::tabulate_atomic(const int ik, const
     // CACHE 1 - if cache the tab_, <G+k|p> can be reused for SCF and RELAX calculation
     // [in] pw_basis, ik, omega, tpiba, irow2it
     this->ik_ = ik;
+    this->becp_ready_ = false;
     this->npw_ = pw_basis_->npwk[ik];
     this->npwx_ = pw_basis_->npwk_max;
     // std::vector<ModuleBase::Vector3<double>> q(this->npw_);
@@ -340,7 +345,8 @@ void projectors::OnsiteProjector<T, Device>::tabulate_atomic(const int ik, const
 template<typename T, typename Device>
 void projectors::OnsiteProjector<T, Device>::overlap_proj_psi( 
                     const int npm,
-                    const std::complex<double>* ppsi)
+                    const std::complex<double>* ppsi,
+                    const int ld_psi)
 {
     ModuleBase::timer::start("OnsiteProj", "overlap");
     // STAGE 3 - cal_becp
@@ -398,11 +404,13 @@ void projectors::OnsiteProjector<T, Device>::overlap_proj_psi(
             this->h_becp = this->becp;
         }
     }
-    this->fs_tools->cal_becp(ik_, npm/npol, this->becp, ppsi); // in cal_becp, npm should be the one not multiplied by npol
+    this->fs_tools->cal_becp(ik_, npm/npol, this->becp, ppsi, ld_psi > 0 ? ld_psi : this->npwx_); // in cal_becp, npm should be the one not multiplied by npol
     if(this->device == base_device::GpuDevice)
     {
         syncmem_complex_d2h_op()(h_becp, this->becp, this->size_becp);
     }
+    this->becp_ready_ = true;
+    this->ik_becp_ = this->ik_;
     ModuleBase::timer::end("OnsiteProj", "overlap");
 }
 
@@ -582,6 +590,46 @@ void projectors::OnsiteProjector<T, Device>::cal_occupations(
     ModuleBase::timer::end("OnsiteProj", "cal_occupation");
 }
 
+template <typename T, typename Device>
+void projectors::OnsiteProjector<T, Device>::cal_force_onsite_dftu(int ik, int npm, T* force,
+                                                        const Plus_U& dftu, int nks,
+                                                        const double* wg_ik) const
+{
+    const int isk_val = this->isk_ ? this->isk_[ik] : 0;
+    const std::complex<double>* vu_ptr = dftu.get_eff_pot_pw_spin(isk_val);
+    const int vu_size = dftu.get_size_eff_pot_pw_spin();
+    this->fs_tools->cal_force_dftu(ik, npm, force,
+        dftu.get_orbital_corr_data(), vu_ptr, vu_size, wg_ik);
+}
+
+template <typename T, typename Device>
+double projectors::OnsiteProjector<T, Device>::cal_stress_onsite_dftu(int ik, int npm,
+                                                           const Plus_U& dftu, int nks,
+                                                           const double* wg_ik) const
+{
+    const int isk_val = this->isk_ ? this->isk_[ik] : 0;
+    const std::complex<double>* vu_ptr = dftu.get_eff_pot_pw_spin(isk_val);
+    const int vu_size = dftu.get_size_eff_pot_pw_spin();
+    return this->fs_tools->cal_stress_dftu(ik, npm,
+        dftu.get_orbital_corr_data(), vu_ptr, vu_size, wg_ik);
+}
+
+template <typename T, typename Device>
+void projectors::OnsiteProjector<T, Device>::cal_force_onsite_dspin(int ik, int npm, T* force,
+                                                         const ModuleBase::Vector3<double>* lambda,
+                                                         const double* wg_ik) const
+{
+    this->fs_tools->cal_force_dspin(ik, npm, force, lambda, wg_ik);
+}
+
+template <typename T, typename Device>
+double projectors::OnsiteProjector<T, Device>::cal_stress_onsite_dspin(int ik, int npm,
+                                                            const ModuleBase::Vector3<double>* lambda,
+                                                            const double* wg_ik) const
+{
+    return this->fs_tools->cal_stress_dspin(ik, npm, lambda, wg_ik);
+}
+
 template class projectors::OnsiteProjector<double, base_device::DEVICE_CPU>;
 #if ((defined __CUDA) || (defined __ROCM))
 template class projectors::OnsiteProjector<double, base_device::DEVICE_GPU>;
diff --git a/source/source_pw/module_pwdft/onsite_proj.h b/source/source_pw/module_pwdft/onsite_proj.h
index 34c39e1fcd3..fdb83355ac3 100644
--- a/source/source_pw/module_pwdft/onsite_proj.h
+++ b/source/source_pw/module_pwdft/onsite_proj.h
@@ -7,6 +7,7 @@
 #include "source_pw/module_pwdft/radial_proj.h"
 #include "source_psi/psi.h"
 #include "source_pw/module_pwdft/onsite_proj_tools.h"
+#include "source_lcao/module_dftu/dftu.h"
 
 #include <string>
 #include <vector>
@@ -43,9 +44,13 @@ namespace projectors
          */
         void tabulate_atomic(const int ik, const char grad = 'n');
         
+        /// compute becp = <alpha|psi>; ld_psi is the leading dimension of psi
+        /// (defaults to npwx if 0, but should be ngk[ik] when called from
+        /// the Davidson/CG solver where psi stride varies per k-point)
         void overlap_proj_psi(
                     const int npm,
-                    const std::complex<double>* ppsi
+                    const std::complex<double>* ppsi,
+                    const int ld_psi = 0
                     );
         void read_abacus_orb(std::ifstream& ifs,
                             std::string& elem,
@@ -81,8 +86,31 @@ namespace projectors
         int get_npwx() const { return npwx_; }
         const int& get_nh(int iat) const { return iat_nh[iat]; }
 
+        bool is_becp_ready(int ik) const { return becp_ready_ && ik_becp_ == ik; }
+        void invalidate_becp() { becp_ready_ = false; }
+
         hamilt::Onsite_Proj_tools<T, Device>* get_fs_tools() const { return fs_tools; }
 
+        /// high-level: compute DFT+U force contribution for one k-point
+        void cal_force_onsite_dftu(int ik, int npm, T* force,
+                                   const Plus_U& dftu, int nks,
+                                   const double* wg_ik) const;
+
+        /// high-level: compute DFT+U stress contribution for one k-point
+        double cal_stress_onsite_dftu(int ik, int npm,
+                                      const Plus_U& dftu, int nks,
+                                      const double* wg_ik) const;
+
+        /// high-level: compute DeltaSpin force contribution for one k-point
+        void cal_force_onsite_dspin(int ik, int npm, T* force,
+                                    const ModuleBase::Vector3<double>* lambda,
+                                    const double* wg_ik) const;
+
+        /// high-level: compute DeltaSpin stress contribution for one k-point
+        double cal_stress_onsite_dspin(int ik, int npm,
+                                       const ModuleBase::Vector3<double>* lambda,
+                                       const double* wg_ik) const;
+
         private:
         OnsiteProjector(){};
         ~OnsiteProjector();
@@ -105,6 +133,8 @@ namespace projectors
         int npw_ = 0;
         int npwx_ = 0;
         int ik_ = 0;
+        bool becp_ready_ = false;
+        int ik_becp_ = -1;
         std::vector<std::vector<int>> it2ia;
         std::vector<double> rgrid;
         std::vector<std::vector<double>> projs;
@@ -114,6 +144,8 @@ namespace projectors
 
         const UnitCell* ucell = nullptr;
 
+        const int* isk_ = nullptr;  ///< spin index per k-point (from K_Vectors)
+
         const ModulePW::PW_Basis_K* pw_basis_ = nullptr;             // level1: the plane wave basis, need ik
         Structure_Factor* sf_ = nullptr;                             // level2: the structure factor calculator
         int ntype = 0;
diff --git a/source/source_pw/module_pwdft/onsite_proj_tools.cpp b/source/source_pw/module_pwdft/onsite_proj_tools.cpp
index 488b6e76177..8ee2cda2809 100644
--- a/source/source_pw/module_pwdft/onsite_proj_tools.cpp
+++ b/source/source_pw/module_pwdft/onsite_proj_tools.cpp
@@ -280,7 +280,8 @@ template <typename FPTYPE, typename Device>
 void Onsite_Proj_tools<FPTYPE, Device>::cal_becp(int ik,
                                                  int npm,
                                                  std::complex<FPTYPE>* becp_in,
-                                                 const std::complex<FPTYPE>* ppsi_in)
+                                                 const std::complex<FPTYPE>* ppsi_in,
+                                                 int npwx)
 {
     ModuleBase::TITLE("Onsite_Proj_tools", "cal_becp");
     ModuleBase::timer::start("Onsite_Proj_tools", "cal_becp");
@@ -436,7 +437,7 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_becp(int ik,
               this->ppcell_vkb,
               npw,
               ppsi,
-              this->max_npw,
+              npwx > 0 ? npwx : this->max_npw,
               &ModuleBase::ZERO,
               becp_tmp,
               this->nkb);
@@ -832,6 +833,7 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_force_dftu(int ik,
         d_wg = const_cast<FPTYPE*>(h_wg);
     }
     const int force_nc = 3;
+    const int npol = this->ucell_->get_npol();
     cal_force_nl_op<FPTYPE, Device>()(this->ctx,
                                       npm,
                                       this->nbands,
@@ -840,6 +842,7 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_force_dftu(int ik,
                                       this->nbands,
                                       ik,
                                       nkb,
+                                      npol,
                                       atom_nh,
                                       atom_na,
                                       this->ucell_->tpiba,
@@ -887,6 +890,7 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_force_dspin(int ik,
         d_wg = const_cast<FPTYPE*>(h_wg);
     }
     const int force_nc = 3;
+    const int npol = this->ucell_->get_npol();
     cal_force_nl_op<FPTYPE, Device>()(this->ctx,
                                       npm,
                                       this->nbands,
@@ -895,6 +899,7 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_force_dspin(int ik,
                                       this->nbands,
                                       ik,
                                       nkb,
+                                      npol,
                                       atom_nh,
                                       atom_na,
                                       this->ucell_->tpiba,
@@ -921,6 +926,7 @@ double Onsite_Proj_tools<FPTYPE, Device>::cal_stress_dftu(int ik,
                                                           const FPTYPE* h_wg)
 {
     double stress_out = 0.0;
+    const int npol = this->ucell_->get_npol();
     
     int* orb_corr_tmp = nullptr;
     std::complex<FPTYPE>* vu_tmp = nullptr;
@@ -949,6 +955,7 @@ double Onsite_Proj_tools<FPTYPE, Device>::cal_stress_dftu(int ik,
                            this->ntype,
                            this->nbands,
                            ik,
+                           npol,
                            atom_nh,
                            atom_na,
                            d_wg,
@@ -963,7 +970,6 @@ double Onsite_Proj_tools<FPTYPE, Device>::cal_stress_dftu(int ik,
         delmem_var_op()(stress_device);
         delmem_complex_op()(vu_tmp);
         delmem_int_op()(orb_corr_tmp);
-	std::cout << "BUG: DFT+U (GPU) stress_out = " << stress_out << std::endl;
     }
     else
 #endif
@@ -978,6 +984,7 @@ double Onsite_Proj_tools<FPTYPE, Device>::cal_stress_dftu(int ik,
                            this->ntype,
                            this->nbands,
                            ik,
+                           npol,
                            atom_nh,
                            atom_na,
                            d_wg,
@@ -999,6 +1006,7 @@ double Onsite_Proj_tools<FPTYPE, Device>::cal_stress_dspin(int ik,
                                                            const FPTYPE* h_wg)
 {
     double stress_out = 0.0;
+    const int npol = this->ucell_->get_npol();
     
     std::vector<FPTYPE> lambda_array(this->ucell_->nat * 3);
     for (int iat = 0; iat < this->ucell_->nat; iat++)
@@ -1027,6 +1035,7 @@ double Onsite_Proj_tools<FPTYPE, Device>::cal_stress_dspin(int ik,
                            this->ntype,
                            this->nbands,
                            ik,
+                           npol,
                            atom_nh,
                            atom_na,
                            d_wg,
@@ -1053,6 +1062,7 @@ double Onsite_Proj_tools<FPTYPE, Device>::cal_stress_dspin(int ik,
                            this->ntype,
                            this->nbands,
                            ik,
+                           npol,
                            atom_nh,
                            atom_na,
                            d_wg,
diff --git a/source/source_pw/module_pwdft/onsite_proj_tools.h b/source/source_pw/module_pwdft/onsite_proj_tools.h
index e877a85070c..0b7ef73b83f 100644
--- a/source/source_pw/module_pwdft/onsite_proj_tools.h
+++ b/source/source_pw/module_pwdft/onsite_proj_tools.h
@@ -62,7 +62,7 @@ class Onsite_Proj_tools
     /**
      * @brief calculate the becp = <psi|beta> for all beta functions
      */
-    void cal_becp(int ik, int npm, std::complex<FPTYPE>* becp_in = nullptr, const std::complex<FPTYPE>* ppsi_in = nullptr);
+    void cal_becp(int ik, int npm, std::complex<FPTYPE>* becp_in = nullptr, const std::complex<FPTYPE>* ppsi_in = nullptr, int npwx = 0);
     /**
      * @brief calculate the dbecp_{ij} = <psi|\partial beta/\partial varepsilon_{ij}> for all beta functions
      *       stress_{ij} = -1/omega \sum_{n,k}f_{nk} \sum_I \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_{ij} also calculated
diff --git a/source/source_pw/module_pwdft/op_pw_proj.cpp b/source/source_pw/module_pwdft/op_pw_proj.cpp
index 8c7cddfc89c..5294a2b7de6 100644
--- a/source/source_pw/module_pwdft/op_pw_proj.cpp
+++ b/source/source_pw/module_pwdft/op_pw_proj.cpp
@@ -70,16 +70,14 @@ void OnsiteProj<OperatorPW<T, Device>>::init(const int ik_in)
 // this function sum up each non-local pseudopotential located on each atom,
 //--------------------------------------------------------------------------
 template<typename T, typename Device>
-void OnsiteProj<OperatorPW<T, Device>>::add_onsite_proj(T *hpsi_in, const int npol, const int m) const
+void OnsiteProj<OperatorPW<T, Device>>::add_onsite_proj(T *hpsi_in, const int npol, const int m, const int npwx) const
 {
     ModuleBase::timer::start("OnsiteProj", "add_onsite_proj");
 
     auto* onsite_p = projectors::OnsiteProjector<double, Device>::get_instance();
-    // apply the operator to the wavefunction
-    //std::cout << "use of tab_atomic at " << __FILE__ << ": " << __LINE__ << std::endl;
     const std::complex<double>* tab_atomic = onsite_p->get_tab_atomic();
     const int npw = onsite_p->get_npw();
-    const int npwx = onsite_p->get_npwx();
+    // npwx passed as parameter
     char transa = 'N';
     char transb = 'T';
     int npm = m;
@@ -102,12 +100,10 @@ void OnsiteProj<OperatorPW<T, Device>>::add_onsite_proj(T *hpsi_in, const int np
 }
 
 template<typename T, typename Device>
-void OnsiteProj<OperatorPW<T, Device>>::update_becp(const T *psi_in, const int npol, const int m) const
+void OnsiteProj<OperatorPW<T, Device>>::update_becp(const T *psi_in, const int npol, const int m, const int npwx) const
 {
     auto* onsite_p = projectors::OnsiteProjector<double, Device>::get_instance();
-    // calculate <alpha|psi> 
-    // std::cout << __FILE__ << ":" << __LINE__ << " nbands = " << m << std::endl;
-    onsite_p->overlap_proj_psi(m, psi_in);
+    onsite_p->overlap_proj_psi(m, psi_in, npwx);
 }
 
 template<typename T, typename Device>
@@ -150,12 +146,27 @@ void OnsiteProj<OperatorPW<T, Device>>::cal_ps_delta_spin(const int npol, const
 
     // prepare array of nh_iat and lambda_array to pass to the onsite_ps_op operator
     std::vector<std::complex<double>> tmp_lambda_coeff(this->ucell->nat * 4);
-    for(int iat=0;iat<this->ucell->nat;iat++)
+    if (npol == 1)
+    {
+        int spin_sign = 1;
+        if (PARAM.inp.nspin == 2)
+        {
+            spin_sign = (this->isk[this->ik] == 0) ? 1 : -1;
+        }
+        for(int iat=0;iat<this->ucell->nat;iat++)
+        {
+            tmp_lambda_coeff[iat] = std::complex<double>(lambda[iat][2] * spin_sign, 0.0);
+        }
+    }
+    else
     {
-        tmp_lambda_coeff[iat * 4] = std::complex<double>(lambda[iat][2], 0.0);
-        tmp_lambda_coeff[iat * 4 + 1] = std::complex<double>(lambda[iat][0], lambda[iat][1]);
-        tmp_lambda_coeff[iat * 4 + 2] = std::complex<double>(lambda[iat][0], -1 * lambda[iat][1]);
-        tmp_lambda_coeff[iat * 4 + 3] = std::complex<double>(-1 * lambda[iat][2], 0.0);
+        for(int iat=0;iat<this->ucell->nat;iat++)
+        {
+            tmp_lambda_coeff[iat * 4] = std::complex<double>(lambda[iat][2], 0.0);
+            tmp_lambda_coeff[iat * 4 + 1] = std::complex<double>(lambda[iat][0], lambda[iat][1]);
+            tmp_lambda_coeff[iat * 4 + 2] = std::complex<double>(lambda[iat][0], -1 * lambda[iat][1]);
+            tmp_lambda_coeff[iat * 4 + 3] = std::complex<double>(-1 * lambda[iat][2], 0.0);
+        }
     }
     syncmem_complex_h2d_op()(this->lambda_coeff, tmp_lambda_coeff.data(), this->ucell->nat * 4);
     // TODO: code block above should be moved to the init function
@@ -168,46 +179,88 @@ void OnsiteProj<OperatorPW<T, Device>>::cal_ps_delta_spin(const int npol, const
         tnp,  
         this->lambda_coeff,
         this->ps, becp);
+}
 
-    /*int sum = 0;
-    if (npol == 1)
-    {
-        const int current_spin = this->isk[this->ik];
-    }
-    else
+// cal_ps_dftu — compute ps = VU * becp for DFT+U Hamiltonian contribution
+//
+// eff_pot_pw layout by nspin:
+//   nspin=1: [iat0_tlp1^2 | iat1_tlp1^2 | ...]
+//            single spin channel, full array uploaded
+//   nspin=2: [iat0_up | iat1_up | ... | iat0_dn | iat1_dn | ...]
+//            split layout — first half is spin-up, second half spin-down.
+//            For isk==1 (spin-down k-point), only the second half is
+//            uploaded to vu_device so that vu_begin_iat[iat] indexes
+//            correctly into the spin-down block.
+//   nspin=4: [iat0_Pauli_4blocks | iat1_Pauli_4blocks | ...]
+//            4*(2l+1)^2 entries per atom; kernel uses npol=2 spinor
+//            structure with 2x2 Pauli matrix coefficients.
+//
+// vu_begin_iat is computed as tlp1^2 * npol^2 per atom at init time,
+// which gives the correct offset for each nspin case:
+//   nspin=1: tlp1^2 * 1 = tlp1^2
+//   nspin=2: tlp1^2 * 1 = tlp1^2 (per spin channel, selected by isk)
+//   nspin=4: tlp1^2 * 4 = (2*tlp1)^2
+template<typename T, typename Device>
+void OnsiteProj<OperatorPW<T, Device>>::setup_pw_dftu_indices() const
+{
+    this->init_dftu = true;
+    auto* onsite_p = projectors::OnsiteProjector<double, Device>::get_instance();
+    const int npol = this->ucell->get_npol();
+
+    resmem_int_op()(this->orb_l_iat, this->ucell->nat);
+    resmem_int_op()(this->ip_m, onsite_p->get_tot_nproj());
+    resmem_int_op()(this->vu_begin_iat, this->ucell->nat);
+    resmem_int_op()(this->ip_iat, onsite_p->get_tot_nproj());
+
+    std::vector<int> ip_iat0(onsite_p->get_tot_nproj());
+    std::vector<int> ip_m0(onsite_p->get_tot_nproj());
+    std::vector<int> vu_begin_iat0(this->ucell->nat);
+    std::vector<int> orb_l_iat0(this->ucell->nat);
+    int ip0 = 0;
+    int vu_begin = 0;
+    for(int iat=0;iat<this->ucell->nat;iat++)
     {
-        for (int iat = 0; iat < this->ucell->nat; iat++)
+        const int it = this->ucell->iat2it[iat];
+        const int target_l = this->dftu->get_orbital_corr(it);
+        orb_l_iat0[iat] = target_l;
+        const int nproj = onsite_p->get_nh(iat);
+        if(target_l == -1)
         {
-            const int nproj = onsite_p->get_nh(iat);
-            if(constrain[iat].x == 0 && constrain[iat].y == 0 && constrain[iat].z == 0)
+            for(int ip=0;ip<nproj;ip++)
             {
-                sum += nproj;
-                continue;
+                ip_iat0[ip0] = iat;
+                ip_m0[ip0++] = -1;
             }
-            const std::complex<double> coefficients0(lambda[iat][2], 0.0);
-            const std::complex<double> coefficients1(lambda[iat][0] , lambda[iat][1]);
-            const std::complex<double> coefficients2(lambda[iat][0] , -1 * lambda[iat][1]);
-            const std::complex<double> coefficients3(-1 * lambda[iat][2], 0.0);
-            // each atom has nproj, means this is with structure factor;
-            // each projector (each atom) must multiply coefficient
-            // with all the other projectors.
-            for (int ib = 0; ib < m; ib+=2)
+            vu_begin_iat0[iat] = 0;
+            continue;
+        }
+        else
+        {
+            const int tlp1 = 2 * target_l + 1;
+            vu_begin_iat0[iat] = vu_begin;
+            vu_begin += tlp1 * tlp1 * npol * npol;
+            const int m_begin = target_l * target_l;
+            const int m_end  = (target_l + 1) * (target_l + 1);
+            for(int ip=0;ip<nproj;ip++)
             {
-                for (int ip = 0; ip < nproj; ip++)
+                ip_iat0[ip0] = iat;
+                if(ip >= m_begin && ip < m_end)
+                {
+                    ip_m0[ip0++] = ip - m_begin;
+                }
+                else
                 {
-                    const int psind = (sum + ip) * m + ib;
-                    const int becpind = ib * tnp + sum + ip;
-                    const std::complex<double> becp1 = becp[becpind];
-                    const std::complex<double> becp2 = becp[becpind + tnp];
-                    ps[psind] += coefficients0 * becp1
-                                    + coefficients2 * becp2;
-                    ps[psind + 1] += coefficients1 * becp1
-                                        + coefficients3 * becp2;
-                } // end ip
-            } // end ib
-            sum += nproj;
-        } // end iat
-    }*/
+                    ip_m0[ip0++] = -1;
+                }
+            }
+        }
+    }
+    syncmem_int_h2d_op()(this->orb_l_iat, orb_l_iat0.data(), this->ucell->nat);
+    syncmem_int_h2d_op()(this->ip_iat, ip_iat0.data(), onsite_p->get_tot_nproj());
+    syncmem_int_h2d_op()(this->ip_m, ip_m0.data(), onsite_p->get_tot_nproj());
+    syncmem_int_h2d_op()(this->vu_begin_iat, vu_begin_iat0.data(), this->ucell->nat);
+
+    resmem_complex_op()(this->vu_device, dftu->get_size_eff_pot_pw());
 }
 
 template<typename T, typename Device>
@@ -223,8 +276,6 @@ void OnsiteProj<OperatorPW<T, Device>>::cal_ps_dftu(
     auto* onsite_p = projectors::OnsiteProjector<double, Device>::get_instance();
     const std::complex<double>* becp = onsite_p->get_becp();
 
-    // T *ps = new T[tnp * m];
-    // ModuleBase::GlobalFunc::ZEROS(ps, m * tnp);
     if (this->nkb_m < m * tnp) {
         resmem_complex_op()(this->ps, tnp * m, "OnsiteProj<PW>::ps");
         this->nkb_m = m * tnp;
@@ -236,140 +287,40 @@ void OnsiteProj<OperatorPW<T, Device>>::cal_ps_dftu(
 
     if(!this->init_dftu)
     {
-        this->init_dftu = true;
-        //prepare orb_l_iat, ip_m, vu_begin_iat and vu_device
-        resmem_int_op()(this->orb_l_iat, this->ucell->nat);
-        resmem_int_op()(this->ip_m, onsite_p->get_tot_nproj());
-        resmem_int_op()(this->vu_begin_iat, this->ucell->nat);
-        // recal the ip_iat
-        resmem_int_op()(this->ip_iat, onsite_p->get_tot_nproj());
-        std::vector<int> ip_iat0(onsite_p->get_tot_nproj());
-        std::vector<int> ip_m0(onsite_p->get_tot_nproj());
-        std::vector<int> vu_begin_iat0(this->ucell->nat);
-        std::vector<int> orb_l_iat0(this->ucell->nat);
-        int ip0 = 0;
-        int vu_begin = 0;
-        for(int iat=0;iat<this->ucell->nat;iat++)
-        {
-            const int it = this->ucell->iat2it[iat];
-            const int target_l = this->dftu->orbital_corr[it];
-            orb_l_iat0[iat] = target_l;
-            const int nproj = onsite_p->get_nh(iat);
-            if(target_l == -1)
-            {
-                for(int ip=0;ip<nproj;ip++)
-                {
-                    ip_iat0[ip0] = iat;
-                    ip_m0[ip0++] = -1;
-                }
-                vu_begin_iat0[iat] = 0;
-                continue;
-            }
-            else
-            {
-                const int tlp1 = 2 * target_l + 1;
-                vu_begin_iat0[iat] = vu_begin;
-                vu_begin += tlp1 * tlp1 * 4;
-                const int m_begin = target_l * target_l;
-                const int m_end  = (target_l + 1) * (target_l + 1);
-                for(int ip=0;ip<nproj;ip++)
-                {
-                    ip_iat0[ip0] = iat;
-                    if(ip >= m_begin && ip < m_end)
-                    {
-                        ip_m0[ip0++] = ip - m_begin;
-                    }
-                    else
-                    {
-                        ip_m0[ip0++] = -1;
-                    }
-                }
-            }
-        }
-        syncmem_int_h2d_op()(this->orb_l_iat, orb_l_iat0.data(), this->ucell->nat);
-        syncmem_int_h2d_op()(this->ip_iat, ip_iat0.data(), onsite_p->get_tot_nproj());
-        syncmem_int_h2d_op()(this->ip_m, ip_m0.data(), onsite_p->get_tot_nproj());
-        syncmem_int_h2d_op()(this->vu_begin_iat, vu_begin_iat0.data(), this->ucell->nat);
-
-        resmem_complex_op()(this->vu_device, dftu->get_size_eff_pot_pw());
+        this->setup_pw_dftu_indices();
     }
 
-    syncmem_complex_h2d_op()(this->vu_device, dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw());
-
+    const int isk_val = (PARAM.inp.nspin == 2) ? this->isk[this->ik] : 0;
+    const std::complex<double>* vu_host = dftu->get_eff_pot_pw_spin(isk_val);
+    const int vu_size = dftu->get_size_eff_pot_pw_spin();
+    syncmem_complex_h2d_op()(this->vu_device, vu_host, vu_size);
     hamilt::onsite_ps_op<Real, Device>()(
-        this->ctx,   // device context
-        m, 
+        this->ctx,
+        m,
         npol,
         this->orb_l_iat,
         this->ip_iat,
         this->ip_m,
-        this->vu_begin_iat, 
-        tnp,  
+        this->vu_begin_iat,
+        tnp,
         this->vu_device,
         this->ps, becp);
-
-    /*
-    int sum = 0;
-    if (npol == 1)
-    {
-        const int current_spin = this->isk[this->ik];
-    }
-    else
-    {
-        for (int iat = 0; iat < this->ucell->nat; iat++)
-        {
-            const int it = this->ucell->iat2it[iat];
-            const int target_l = dftu->orbital_corr[it];
-            const int nproj = onsite_p->get_nh(iat);
-            if(target_l == -1)
-            {
-                sum += nproj;
-                continue;
-            }
-            const int ip_begin = target_l * target_l;
-            const int ip_end = (target_l + 1) * (target_l + 1);
-            const int tlp1 = 2 * target_l + 1;
-            const int tlp1_2 = tlp1 * tlp1;
-            const std::complex<double>* vu = dftu->get_eff_pot_pw(iat);
-            // each projector (each atom) must multiply coefficient
-            // with all the other projectors.
-            for (int ib = 0; ib < m; ib+=2)
-            {
-                for (int ip2 = ip_begin; ip2 < ip_end; ip2++)
-                {
-                    const int psind = (sum + ip2) * m + ib;
-                    const int m2 = ip2 - ip_begin;
-                    for (int ip1 = ip_begin; ip1 < ip_end; ip1++)
-                    {
-                        const int becpind1 = ib * tnp + sum + ip1;
-                        const int m1 = ip1 - ip_begin;
-                        const int index_mm = m1 * tlp1 + m2;
-                        const std::complex<double> becp1 = becp[becpind1];
-                        const std::complex<double> becp2 = becp[becpind1 + tnp];
-                        ps[psind] += vu[index_mm] * becp1
-                                    + vu[index_mm + tlp1_2 * 2] * becp2;
-                        ps[psind + 1] += vu[index_mm + tlp1_2 * 1] * becp1
-                                    + vu[index_mm + tlp1_2 * 3] * becp2;
-                    } // end ip1
-                } // end ip2
-            } // end ib
-            sum += nproj;
-        } // end iat
-    }*/
 }
 
 template<>
 void OnsiteProj<OperatorPW<std::complex<float>, base_device::DEVICE_CPU>>::add_onsite_proj(
 		std::complex<float> *hpsi_in, 
 		const int npol, 
-		const int m) const
+		const int m,
+		const int npwx) const
 {}
 
 template<>
 void OnsiteProj<OperatorPW<std::complex<float>, base_device::DEVICE_CPU>>::update_becp(
 		const std::complex<float> *psi_in, 
 		const int npol, 
-		const int m) const
+		const int m,
+		const int npwx) const
 {}
 
 template<>
@@ -389,14 +340,16 @@ template<>
 void OnsiteProj<OperatorPW<std::complex<float>, base_device::DEVICE_GPU>>::add_onsite_proj(
 		std::complex<float> *hpsi_in, 
 		const int npol, 
-		const int m) const
+		const int m,
+		const int npwx) const
 {}
 
 template<>
 void OnsiteProj<OperatorPW<std::complex<float>, base_device::DEVICE_GPU>>::update_becp(
 		const std::complex<float> *psi_in, 
 		const int npol, 
-		const int m) const
+		const int m,
+		const int npwx) const
 {}
 
 template<>
@@ -412,6 +365,21 @@ void OnsiteProj<OperatorPW<std::complex<float>, base_device::DEVICE_GPU>>::cal_p
 {}
 #endif
 
+// OnsiteProj::act — apply DFT+U and/or DeltaSpin Hamiltonian correction
+//
+// Leading dimension note:
+//   The Davidson/CG solver allocates psi and hpsi with stride ld_psi = ngk[ik]
+//   (the number of G-vectors for the current k-point), NOT npwx (the maximum
+//   across all k-points).  We must pass ld_psi = nbasis/npol through the
+//   GEMM chain to avoid buffer overflow when ngk[ik] < npwx.
+//
+// nspin handling in cal_ps_dftu:
+//   nspin=1 (npol=1): single spin channel, no spin selection needed
+//   nspin=2 (npol=1): eff_pot_pw uses split layout [all_up | all_dn];
+//     spin-up  k-points (isk=0) read from the first  half;
+//     spin-down k-points (isk=1) read from the second half.
+//   nspin=4 (npol=2): all 4 Pauli blocks stored per-atom; kernel uses
+//     2x2 spinor structure with tlp1_npol^2 entries per atom.
 template<typename T, typename Device>
 void OnsiteProj<OperatorPW<T, Device>>::act(
     const int nbands,
@@ -423,10 +391,11 @@ void OnsiteProj<OperatorPW<T, Device>>::act(
     const bool is_first_node)const
 {
     ModuleBase::timer::start("Operator", "OnsiteProjPW");
-    this->update_becp(tmpsi_in, npol, nbands);
+    const int ld_psi = nbasis / npol;
+    this->update_becp(tmpsi_in, npol, nbands, ld_psi);
     this->cal_ps_delta_spin(npol, nbands);
     this->cal_ps_dftu(npol, nbands);
-    this->add_onsite_proj(tmhpsi, npol, nbands);
+    this->add_onsite_proj(tmhpsi, npol, nbands, ld_psi);
     ModuleBase::timer::end("Operator", "OnsiteProjPW");
 }
 
diff --git a/source/source_pw/module_pwdft/op_pw_proj.h b/source/source_pw/module_pwdft/op_pw_proj.h
index 50207cc7b78..bd8044724da 100644
--- a/source/source_pw/module_pwdft/op_pw_proj.h
+++ b/source/source_pw/module_pwdft/op_pw_proj.h
@@ -54,9 +54,12 @@ class OnsiteProj<OperatorPW<T, Device>> : public OperatorPW<T, Device>
 
     void cal_ps_dftu(const int npol, const int m) const;
 
-    void update_becp(const T* psi_in, const int npol, const int m) const;
+    /// one-time setup of DFT+U PW index arrays (orb_l_iat, ip_iat, ip_m, vu_begin_iat)
+    void setup_pw_dftu_indices() const;
 
-    void add_onsite_proj(T *hpsi_in, const int npol, const int m) const;
+    void update_becp(const T* psi_in, const int npol, const int m, const int npwx) const;
+
+    void add_onsite_proj(T *hpsi_in, const int npol, const int m, const int npwx) const;
 
     const int* isk = nullptr;
 
diff --git a/source/source_pw/module_pwdft/setup_pot.cpp b/source/source_pw/module_pwdft/setup_pot.cpp
index 1073774b38c..c17515bf7d4 100644
--- a/source/source_pw/module_pwdft/setup_pot.cpp
+++ b/source/source_pw/module_pwdft/setup_pot.cpp
@@ -99,6 +99,7 @@ void pw::setup_pot(const int istep,
                    PARAM.inp.sccut,
                    PARAM.inp.sc_drop_thr,
                    ucell,
+                   PARAM.inp.sc_direction_only,
                    nullptr, // parallel orbitals
                    PARAM.inp.nspin,
                    kv,
diff --git a/source/source_pw/module_pwdft/stress_onsite.cpp b/source/source_pw/module_pwdft/stress_onsite.cpp
index 3e8fac403fa..8223be49ef4 100644
--- a/source/source_pw/module_pwdft/stress_onsite.cpp
+++ b/source/source_pw/module_pwdft/stress_onsite.cpp
@@ -99,18 +99,10 @@ void Stress_Func<FPTYPE, Device>::stress_onsite(
                 // Calculate dbecp_s = <psi|d(beta)/d(epsilon_ij)>
                 fs_tools->cal_dbecp_s(ik, num_occupied_bands, ipol, jpol);
                 
-                // Add DFT+U contribution if enabled
                 if (PARAM.inp.dft_plus_u)
                 {
-                    // Calculate DFT+U stress contribution
-                    double dftu_stress = fs_tools->cal_stress_dftu(
-                        ik,
-                        num_occupied_bands,
-                        dftu.orbital_corr.data(),
-                        dftu.get_eff_pot_pw(0),
-                        dftu.get_size_eff_pot_pw(),
-                        wg.c
-                    );
+                    double dftu_stress = onsite_projector->cal_stress_onsite_dftu(
+                        ik, num_occupied_bands, dftu, nks, wg.c);
                     
                     sigma_onsite[idx] += dftu_stress;
 #ifdef __DEBUG
@@ -118,23 +110,13 @@ void Stress_Func<FPTYPE, Device>::stress_onsite(
 #endif
                 }
                 
-                // Add spin constraint contribution if enabled
                 if (PARAM.inp.sc_mag_switch)
                 {
-                    // Get spin constraint instance
                     spinconstrain::SpinConstrain<std::complex<double>>& spin_constrain = 
                         spinconstrain::SpinConstrain<std::complex<double>>::getScInstance();
                     
-                    // Get lambda parameters
-                    const std::vector<ModuleBase::Vector3<double>>& lambda = spin_constrain.get_sc_lambda();
-                    
-                    // Calculate spin constraint stress contribution
-                    double dspin_stress = fs_tools->cal_stress_dspin(
-                        ik,
-                        num_occupied_bands,
-                        lambda.data(),
-                        wg.c
-                    );
+                    double dspin_stress = onsite_projector->cal_stress_onsite_dspin(
+                        ik, num_occupied_bands, spin_constrain.get_sc_lambda().data(), wg.c);
                     
                     sigma_onsite[idx] += dspin_stress;
                 }
diff --git a/tests/01_PW/020_PW_kspace/KPT b/tests/01_PW/020_PW_kspace/KPT
new file mode 100644
index 00000000000..4fd38968a05
--- /dev/null
+++ b/tests/01_PW/020_PW_kspace/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+2 2 1 0 0 0
diff --git a/tests/01_PW/021_PW_kspace3/KPT b/tests/01_PW/021_PW_kspace3/KPT
new file mode 100644
index 00000000000..29f3ef8bdf3
--- /dev/null
+++ b/tests/01_PW/021_PW_kspace3/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+3 2 1 0 0 0
diff --git a/tests/01_PW/035_PW_15_SO/log_all_fix.txt b/tests/01_PW/035_PW_15_SO/log_all_fix.txt
new file mode 100644
index 00000000000..0c68c0f61e0
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/log_all_fix.txt
@@ -0,0 +1,114 @@
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 5837a6526 (Sun May 3 09:44:20 2026 +0800)
+
+ Sun May  3 10:30:11 2026
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 3 for Ga: [Ar] 3d10 4s2 4p1
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(1.0224e-05 SEC) : SETUP UNITCELL
+ DONE(0.00217029 SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       XC          
+ As      1           
+ Ga      1           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.0227723  SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.121629   SEC) : LOCAL POTENTIAL
+ DONE(0.156523   SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.266724
+ DONE(0.156625   SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(0.680372   SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1      0.00e+00   0.00e+00   2.00e+00   2.00e+00  -1.59867930e+03   0.00000000e+00   3.1042e+01  10.16
+ DS2      0.00e+00   0.00e+00   9.75e-01   1.06e+00  -1.68133543e+03  -8.26561268e+01   3.8628e+00   4.09
+ DS3      0.00e+00   0.00e+00   8.68e-01   8.72e-01  -1.67677930e+03   4.55612625e+00   1.0730e+00   2.01
+ DS4      0.00e+00   0.00e+00   7.46e-01   7.69e-01  -1.67820852e+03  -1.42921557e+00   8.1469e-02   2.88
+ DS5      0.00e+00   0.00e+00   7.61e-01   7.70e-01  -1.67833326e+03  -1.24741925e-01   2.3457e-02   3.77
+ DS6      0.00e+00   0.00e+00   7.60e-01   7.69e-01  -1.67835572e+03  -2.24548962e-02   3.2082e-03   1.08
+ DS7      0.00e+00   0.00e+00   7.42e-01   7.50e-01  -1.67836230e+03  -6.58348573e-03   9.5446e-04   0.27
+ DS8      0.00e+00   0.00e+00   7.31e-01   7.38e-01  -1.67836427e+03  -1.97256808e-03   1.3430e-04   0.08
+ DS9      0.00e+00   0.00e+00   7.28e-01   7.35e-01  -1.67836476e+03  -4.90206437e-04   5.3510e-05   0.19
+ DS10     0.00e+00   0.00e+00   7.32e-01   7.40e-01  -1.67836488e+03  -1.12483838e-04   2.8637e-05   0.10
+ DS11     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836501e+03  -1.31704337e-04   1.1546e-05   0.07
+ DS12     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -6.86851807e-05   3.3868e-06   0.08
+ DS13     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -9.16607677e-06   2.4541e-06   0.07
+ DS14     0.00e+00   0.00e+00   7.34e-01   7.41e-01  -1.67836510e+03  -1.34461071e-05   3.5635e-07   0.09
+ ----------------------------------------------------------------
+              Stress_x             Stress_y             Stress_z 
+ ----------------------------------------------------------------
+     -10683.9706741759      -396.2387945264       396.2241082742 
+       -396.2387945264    -10683.9707016515       396.2241283692 
+        396.2241082742       396.2241283692    -10626.8786336910 
+ ----------------------------------------------------------------
+ TOTAL-PRESSURE (EXCLUDE KINETIC PART OF IONS): -10664.940003 kbar
+
+ TIME STATISTICS
+-------------------------------------------------------------------
+    CLASS_NAME           NAME        TIME/s  CALLS   AVG/s  PER/%  
+-------------------------------------------------------------------
+ Driver            atomic_world      25.68  1        25.68  100.00 
+                   total             25.66  14       1.83   99.93  
+ PW_Basis_Sup      recip2real        0.30   250      0.00   1.17   
+ Relax_Driver      relax_driver      25.50  1        25.50  99.31  
+ ESolver_KS        runner            25.48  1        25.48  99.23  
+ ESolver_KS_PW     before_scf        0.52   1        0.52   2.04   
+ Potential         cal_veff          0.57   15       0.04   2.22   
+ PW_Basis_Sup      real2recip        0.39   289      0.00   1.53   
+ PotXC             cal_veff          0.51   15       0.03   1.98   
+ XC_Functional     v_xc              0.51   15       0.03   1.97   
+ PSIPrepare        initialize_psi    0.44   1        0.44   1.71   
+ psi_init          random_t          0.44   2        0.22   1.70   
+ psi_init          stick_to_pool     0.28   27664    0.00   1.08   
+ ESolver_KS_PW     hamilt2rho_single 24.24  14       1.73   94.39  
+ HSolverPW         solve             24.24  14       1.73   94.39  
+ HSolverPW         solve_psik        21.31  28       0.76   82.97  
+ Diago_DavSubspace diag_once         21.21  28       0.76   82.61  
+ Diago_DavSubspace first             5.13   28       0.18   19.99  
+ Operator          hPsi              17.46  110      0.16   67.99  
+ Operator          veff_pw           17.11  110      0.16   66.62  
+ PW_Basis_K        recip2real        11.21  8480     0.00   43.64  
+ PW_Basis_K        real2recip        8.70   6352     0.00   33.87  
+ Operator          nonlocal_pw       0.34   110      0.00   1.34   
+ Diago_DavSubspace cal_elem          0.40   110      0.00   1.57   
+ Diago_DavSubspace cal_grad          15.50  82       0.19   60.36  
+ ElecStatePW       psiToRho          2.88   14       0.21   11.20  
+-------------------------------------------------------------------
+
+
+ START  Time  : Sun May  3 10:30:11 2026
+ FINISH Time  : Sun May  3 10:30:40 2026
+ TOTAL  Time  : 29
+ SEE INFORMATION IN : OUT.autotest/
diff --git a/tests/01_PW/035_PW_15_SO/log_dev_fresh.txt b/tests/01_PW/035_PW_15_SO/log_dev_fresh.txt
new file mode 100644
index 00000000000..3ea86664e9d
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/log_dev_fresh.txt
@@ -0,0 +1,116 @@
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 0f9d7d97e (Thu Apr 30 12:48:20 2026 +0800)
+
+ Sun May  3 10:26:48 2026
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 3 for Ga: [Ar] 3d10 4s2 4p1
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(0.0236263  SEC) : SETUP UNITCELL
+ DONE(0.0258316  SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       
+ As      1           
+ Ga      1           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.0370996  SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.0492078  SEC) : LOCAL POTENTIAL
+ DONE(0.0792156  SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.266724
+ DONE(0.0792726  SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(0.131711   SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1      0.00e+00   0.00e+00   2.00e+00   2.00e+00  -1.59867930e+03   0.00000000e+00   3.1042e+01   0.62
+ DS2      0.00e+00   0.00e+00   9.75e-01   1.06e+00  -1.68133543e+03  -8.26561268e+01   3.8628e+00   0.15
+ DS3      0.00e+00   0.00e+00   8.68e-01   8.72e-01  -1.67677930e+03   4.55612625e+00   1.0730e+00   0.11
+ DS4      0.00e+00   0.00e+00   7.46e-01   7.69e-01  -1.67820852e+03  -1.42921557e+00   8.1469e-02   0.09
+ DS5      0.00e+00   0.00e+00   7.61e-01   7.70e-01  -1.67833326e+03  -1.24741925e-01   2.3457e-02   0.13
+ DS6      0.00e+00   0.00e+00   7.60e-01   7.69e-01  -1.67835572e+03  -2.24548962e-02   3.2082e-03   0.07
+ DS7      0.00e+00   0.00e+00   7.42e-01   7.50e-01  -1.67836230e+03  -6.58348573e-03   9.5446e-04   0.14
+ DS8      0.00e+00   0.00e+00   7.31e-01   7.38e-01  -1.67836427e+03  -1.97256808e-03   1.3430e-04   0.07
+ DS9      0.00e+00   0.00e+00   7.28e-01   7.35e-01  -1.67836476e+03  -4.90206437e-04   5.3510e-05   0.10
+ DS10     0.00e+00   0.00e+00   7.32e-01   7.40e-01  -1.67836488e+03  -1.12483839e-04   2.8637e-05   0.06
+ DS11     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836501e+03  -1.31704337e-04   1.1546e-05   0.10
+ DS12     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -6.86851811e-05   3.3868e-06   0.09
+ DS13     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -9.16607600e-06   2.4541e-06   0.12
+ DS14     0.00e+00   0.00e+00   7.34e-01   7.41e-01  -1.67836510e+03  -1.34461075e-05   3.5635e-07   0.19
+ ----------------------------------------------------------------
+              Stress_x             Stress_y             Stress_z 
+ ----------------------------------------------------------------
+     -10683.9706741759      -396.2387945264       396.2241082742 
+       -396.2387945264    -10683.9707016515       396.2241283692 
+        396.2241082742       396.2241283692    -10626.8786336910 
+ ----------------------------------------------------------------
+ TOTAL-PRESSURE (EXCLUDE KINETIC PART OF IONS): -10664.940003 kbar
+
+ TIME STATISTICS
+-------------------------------------------------------------------
+    CLASS_NAME           NAME        TIME/s  CALLS   AVG/s  PER/%  
+-------------------------------------------------------------------
+                   total             2.16   15       0.14   100.00 
+ Driver            atomic_world      2.16   1        2.16   100.00 
+ PW_Basis_Sup      recip2real        0.04   250      0.00   1.75   
+ ppcell_vnl        init_vnl          0.03   1        0.03   1.18   
+ Relax_Driver      relax_driver      2.08   1        2.08   96.25  
+ ESolver_KS        runner            2.08   1        2.08   95.92  
+ ESolver_KS_PW     before_scf        0.05   1        0.05   2.42   
+ H_Ewald_pw        compute_ewald     0.02   1        0.02   1.14   
+ Potential         cal_veff          0.08   15       0.01   3.87   
+ PW_Basis_Sup      real2recip        0.05   289      0.00   2.43   
+ PotXC             cal_veff          0.08   15       0.01   3.66   
+ XC_Functional     v_xc              0.08   15       0.01   3.65   
+ ESolver_KS_PW     hamilt2rho_single 1.91   14       0.14   88.35  
+ HSolverPW         solve             1.91   14       0.14   88.33  
+ HSolverPW         solve_psik        1.72   28       0.06   79.32  
+ Diago_DavSubspace diag_once         1.69   28       0.06   78.13  
+ Diago_DavSubspace first             0.50   28       0.02   23.31  
+ Operator          hPsi              1.28   110      0.01   59.26  
+ Operator          veff_pw           1.22   110      0.01   56.38  
+ PW_Basis_K        recip2real        0.76   8480     0.00   35.27  
+ PW_Basis_K        real2recip        0.61   6352     0.00   28.15  
+ Operator          nonlocal_pw       0.06   110      0.00   2.81   
+ Nonlocal          add_nonlocal_pp   0.03   110      0.00   1.18   
+ Diago_DavSubspace cal_elem          0.06   110      0.00   2.88   
+ Diago_DavSubspace diag_zhegvx       0.16   110      0.00   7.35   
+ Diago_DavSubspace cal_grad          0.98   82       0.01   45.24  
+ Diago_DavSubspace last              0.03   73       0.00   1.44   
+ ElecStatePW       psiToRho          0.18   14       0.01   8.31   
+-------------------------------------------------------------------
+
+
+ START  Time  : Sun May  3 10:26:48 2026
+ FINISH Time  : Sun May  3 10:26:50 2026
+ TOTAL  Time  : 2
+ SEE INFORMATION IN : OUT.autotest/
diff --git a/tests/01_PW/035_PW_15_SO/log_dev_np4.txt b/tests/01_PW/035_PW_15_SO/log_dev_np4.txt
new file mode 100644
index 00000000000..1dfcab69834
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/log_dev_np4.txt
@@ -0,0 +1,116 @@
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 0f9d7d97e (Thu Apr 30 12:48:20 2026 +0800)
+
+ Sun May  3 09:53:39 2026
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 3 for Ga: [Ar] 3d10 4s2 4p1
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(0.0332596  SEC) : SETUP UNITCELL
+ DONE(0.0366598  SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       
+ As      1           
+ Ga      1           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.0414821  SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.0673018  SEC) : LOCAL POTENTIAL
+ DONE(0.102441   SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.266724
+ DONE(0.102543   SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(0.20761    SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1      0.00e+00   0.00e+00   2.00e+00   2.00e+00  -1.59867930e+03   0.00000000e+00   3.1042e+01   0.89
+ DS2      0.00e+00   0.00e+00   9.75e-01   1.06e+00  -1.68133543e+03  -8.26561268e+01   3.8628e+00   0.18
+ DS3      0.00e+00   0.00e+00   8.68e-01   8.72e-01  -1.67677930e+03   4.55612625e+00   1.0730e+00   0.09
+ DS4      0.00e+00   0.00e+00   7.46e-01   7.69e-01  -1.67820852e+03  -1.42921557e+00   8.1469e-02   0.06
+ DS5      0.00e+00   0.00e+00   7.61e-01   7.70e-01  -1.67833326e+03  -1.24741925e-01   2.3457e-02   0.15
+ DS6      0.00e+00   0.00e+00   7.60e-01   7.69e-01  -1.67835572e+03  -2.24548962e-02   3.2082e-03   0.12
+ DS7      0.00e+00   0.00e+00   7.42e-01   7.50e-01  -1.67836230e+03  -6.58348573e-03   9.5446e-04   1.30
+ DS8      0.00e+00   0.00e+00   7.31e-01   7.38e-01  -1.67836427e+03  -1.97256808e-03   1.3430e-04   0.21
+ DS9      0.00e+00   0.00e+00   7.28e-01   7.35e-01  -1.67836476e+03  -4.90206436e-04   5.3510e-05   0.30
+ DS10     0.00e+00   0.00e+00   7.32e-01   7.40e-01  -1.67836488e+03  -1.12483839e-04   2.8637e-05   0.10
+ DS11     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836501e+03  -1.31704337e-04   1.1546e-05   0.10
+ DS12     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -6.86851807e-05   3.3868e-06   0.12
+ DS13     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -9.16607658e-06   2.4541e-06   0.13
+ DS14     0.00e+00   0.00e+00   7.34e-01   7.41e-01  -1.67836510e+03  -1.34461078e-05   3.5635e-07   0.06
+ ----------------------------------------------------------------
+              Stress_x             Stress_y             Stress_z 
+ ----------------------------------------------------------------
+     -10683.9706741759      -396.2387945264       396.2241082742 
+       -396.2387945264    -10683.9707016515       396.2241283692 
+        396.2241082742       396.2241283692    -10626.8786336910 
+ ----------------------------------------------------------------
+ TOTAL-PRESSURE (EXCLUDE KINETIC PART OF IONS): -10664.940003 kbar
+
+ TIME STATISTICS
+-------------------------------------------------------------------
+    CLASS_NAME           NAME        TIME/s  CALLS   AVG/s  PER/%  
+-------------------------------------------------------------------
+                   total             4.03   15       0.27   100.00 
+ Driver            atomic_world      4.03   1        4.03   100.00 
+ PW_Basis_Sup      recip2real        0.07   250      0.00   1.78   
+ Relax_Driver      relax_driver      3.92   1        3.92   97.40  
+ ESolver_KS        runner            3.91   1        3.91   97.14  
+ ESolver_KS_PW     before_scf        0.10   1        0.10   2.60   
+ Potential         cal_veff          0.10   15       0.01   2.39   
+ PW_Basis_Sup      real2recip        0.07   289      0.00   1.82   
+ PotXC             cal_veff          0.08   15       0.01   2.03   
+ XC_Functional     v_xc              0.08   15       0.01   2.03   
+ PSIPrepare        initialize_psi    0.09   1        0.09   2.33   
+ psi_init          random_t          0.09   2        0.05   2.28   
+ psi_init          stick_to_pool     0.06   27664    0.00   1.44   
+ ESolver_KS_PW     hamilt2rho_single 3.61   14       0.26   89.64  
+ HSolverPW         solve             3.61   14       0.26   89.63  
+ HSolverPW         solve_psik        3.18   28       0.11   78.99  
+ Diago_DavSubspace diag_once         3.15   28       0.11   78.27  
+ Diago_DavSubspace first             0.56   28       0.02   13.87  
+ Operator          hPsi              2.63   110      0.02   65.34  
+ Operator          veff_pw           2.55   110      0.02   63.25  
+ PW_Basis_K        recip2real        1.59   8480     0.00   39.38  
+ PW_Basis_K        real2recip        1.34   6352     0.00   33.32  
+ Operator          nonlocal_pw       0.08   110      0.00   2.01   
+ Diago_DavSubspace cal_elem          0.11   110      0.00   2.74   
+ Diago_DavSubspace diag_zhegvx       0.18   110      0.00   4.35   
+ Diago_DavSubspace cal_grad          2.32   82       0.03   57.68  
+ ElecStatePW       psiToRho          0.42   14       0.03   10.33  
+ Charge_Mixing     get_drho          0.05   14       0.00   1.25   
+-------------------------------------------------------------------
+
+
+ START  Time  : Sun May  3 09:53:39 2026
+ FINISH Time  : Sun May  3 09:53:43 2026
+ TOTAL  Time  : 4
+ SEE INFORMATION IN : OUT.autotest/
diff --git a/tests/01_PW/035_PW_15_SO/log_dev_v2.txt b/tests/01_PW/035_PW_15_SO/log_dev_v2.txt
new file mode 100644
index 00000000000..2f11684fb3e
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/log_dev_v2.txt
@@ -0,0 +1,116 @@
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 0f9d7d97e (Thu Apr 30 12:48:20 2026 +0800)
+
+ Sun May  3 11:36:34 2026
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 3 for Ga: [Ar] 3d10 4s2 4p1
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(0.030315   SEC) : SETUP UNITCELL
+ DONE(0.0305225  SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       
+ As      1           
+ Ga      1           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.0370012  SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.0565552  SEC) : LOCAL POTENTIAL
+ DONE(0.0896376  SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.266724
+ DONE(0.0897285  SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(0.275543   SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1      0.00e+00   0.00e+00   2.00e+00   2.00e+00  -1.59867930e+03   0.00000000e+00   3.1042e+01   1.19
+ DS2      0.00e+00   0.00e+00   9.75e-01   1.06e+00  -1.68133543e+03  -8.26561268e+01   3.8628e+00   0.33
+ DS3      0.00e+00   0.00e+00   8.68e-01   8.72e-01  -1.67677930e+03   4.55612625e+00   1.0730e+00   0.10
+ DS4      0.00e+00   0.00e+00   7.46e-01   7.69e-01  -1.67820852e+03  -1.42921557e+00   8.1469e-02   0.10
+ DS5      0.00e+00   0.00e+00   7.61e-01   7.70e-01  -1.67833326e+03  -1.24741925e-01   2.3457e-02   0.12
+ DS6      0.00e+00   0.00e+00   7.60e-01   7.69e-01  -1.67835572e+03  -2.24548962e-02   3.2082e-03   0.11
+ DS7      0.00e+00   0.00e+00   7.42e-01   7.50e-01  -1.67836230e+03  -6.58348573e-03   9.5446e-04   0.39
+ DS8      0.00e+00   0.00e+00   7.31e-01   7.38e-01  -1.67836427e+03  -1.97256807e-03   1.3430e-04   0.16
+ DS9      0.00e+00   0.00e+00   7.28e-01   7.35e-01  -1.67836476e+03  -4.90206437e-04   5.3510e-05   0.22
+ DS10     0.00e+00   0.00e+00   7.32e-01   7.40e-01  -1.67836488e+03  -1.12483838e-04   2.8637e-05   0.11
+ DS11     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836501e+03  -1.31704337e-04   1.1546e-05   0.10
+ DS12     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -6.86851807e-05   3.3868e-06   0.11
+ DS13     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -9.16607697e-06   2.4541e-06   0.19
+ DS14     0.00e+00   0.00e+00   7.34e-01   7.41e-01  -1.67836510e+03  -1.34461071e-05   3.5635e-07   0.13
+ ----------------------------------------------------------------
+              Stress_x             Stress_y             Stress_z 
+ ----------------------------------------------------------------
+     -10683.9706741759      -396.2387945264       396.2241082742 
+       -396.2387945264    -10683.9707016515       396.2241283692 
+        396.2241082742       396.2241283692    -10626.8786336910 
+ ----------------------------------------------------------------
+ TOTAL-PRESSURE (EXCLUDE KINETIC PART OF IONS): -10664.940003 kbar
+
+ TIME STATISTICS
+-------------------------------------------------------------------
+    CLASS_NAME           NAME        TIME/s  CALLS   AVG/s  PER/%  
+-------------------------------------------------------------------
+                   total             3.68   15       0.25   100.00 
+ Driver            atomic_world      3.68   1        3.68   100.00 
+ PW_Basis_Sup      recip2real        0.06   250      0.00   1.73   
+ Relax_Driver      relax_driver      3.58   1        3.58   97.51  
+ ESolver_KS        runner            3.55   1        3.55   96.68  
+ ESolver_KS_PW     before_scf        0.19   1        0.19   5.05   
+ Potential         cal_veff          0.11   15       0.01   2.87   
+ PW_Basis_Sup      real2recip        0.05   289      0.00   1.49   
+ PotXC             cal_veff          0.09   15       0.01   2.54   
+ XC_Functional     v_xc              0.09   15       0.01   2.53   
+ PSIPrepare        initialize_psi    0.17   1        0.17   4.64   
+ psi_init          random_t          0.17   2        0.09   4.63   
+ psi_init          stick_to_pool     0.11   27664    0.00   3.01   
+ ESolver_KS_PW     hamilt2rho_single 3.21   14       0.23   87.32  
+ HSolverPW         solve             3.21   14       0.23   87.32  
+ HSolverPW         solve_psik        2.83   28       0.10   77.05  
+ Diago_DavSubspace diag_once         2.79   28       0.10   75.89  
+ Diago_DavSubspace first             0.88   28       0.03   24.05  
+ Operator          hPsi              2.30   110      0.02   62.47  
+ Operator          veff_pw           2.22   110      0.02   60.43  
+ PW_Basis_K        recip2real        1.37   8480     0.00   37.39  
+ PW_Basis_K        real2recip        1.17   6352     0.00   31.87  
+ Operator          nonlocal_pw       0.07   110      0.00   1.99   
+ Diago_DavSubspace cal_elem          0.07   110      0.00   1.88   
+ Diago_DavSubspace diag_zhegvx       0.18   110      0.00   4.81   
+ Diago_DavSubspace cal_grad          1.66   82       0.02   45.15  
+ Diago_DavSubspace last              0.04   73       0.00   1.22   
+ ElecStatePW       psiToRho          0.36   14       0.03   9.78   
+-------------------------------------------------------------------
+
+
+ START  Time  : Sun May  3 11:36:34 2026
+ FINISH Time  : Sun May  3 11:36:38 2026
+ TOTAL  Time  : 4
+ SEE INFORMATION IN : OUT.autotest/
diff --git a/tests/01_PW/035_PW_15_SO/log_final.txt b/tests/01_PW/035_PW_15_SO/log_final.txt
new file mode 100644
index 00000000000..670673e6b62
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/log_final.txt
@@ -0,0 +1,61 @@
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 5837a6526 (Sun May 3 09:44:20 2026 +0800)
+
+ Sun May  3 11:41:06 2026
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 3 for Ga: [Ar] 3d10 4s2 4p1
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(1.433e-05  SEC) : SETUP UNITCELL
+ DONE(0.00395945 SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       XC          
+ As      1           
+ Ga      1           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.0470998  SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.238311   SEC) : LOCAL POTENTIAL
+ DONE(0.305711   SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.266724
+ DONE(0.305784   SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(4.74629    SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1      0.00e+00   0.00e+00   2.00e+00   2.00e+00  -1.59867930e+03   0.00000000e+00   3.1042e+01 119.68
+ DS2      0.00e+00   0.00e+00   9.75e-01   1.06e+00  -1.68133543e+03  -8.26561268e+01   3.8628e+00  30.44
+ DS3      0.00e+00   0.00e+00   8.68e-01   8.72e-01  -1.67677930e+03   4.55612625e+00   1.0730e+00  24.67
+ DS4      0.00e+00   0.00e+00   7.46e-01   7.69e-01  -1.67820852e+03  -1.42921557e+00   8.1469e-02  22.22
+ DS5      0.00e+00   0.00e+00   7.61e-01   7.70e-01  -1.67833326e+03  -1.24741925e-01   2.3457e-02  27.82
+ DS6      0.00e+00   0.00e+00   7.60e-01   7.69e-01  -1.67835572e+03  -2.24548962e-02   3.2082e-03  28.66
+ DS7      0.00e+00   0.00e+00   7.42e-01   7.50e-01  -1.67836230e+03  -6.58348573e-03   9.5446e-04  16.99
diff --git a/tests/01_PW/035_PW_15_SO/log_pr_correct.txt b/tests/01_PW/035_PW_15_SO/log_pr_correct.txt
new file mode 100644
index 00000000000..0b1a32515e5
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/log_pr_correct.txt
@@ -0,0 +1,56 @@
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 5837a6526 (Sun May 3 09:44:20 2026 +0800)
+
+ Sun May  3 11:32:46 2026
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 3 for Ga: [Ar] 3d10 4s2 4p1
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(9.947e-06  SEC) : SETUP UNITCELL
+ DONE(0.00671472 SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       XC          
+ As      1           
+ Ga      1           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.0815573  SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.430605   SEC) : LOCAL POTENTIAL
+ DONE(0.479579   SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.266724
+ DONE(0.479772   SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(3.3452     SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1      0.00e+00   0.00e+00   2.00e+00   2.00e+00  -1.59867930e+03   0.00000000e+00   3.1042e+01 106.62
+ DS2      0.00e+00   0.00e+00   9.75e-01   1.06e+00  -1.68133543e+03  -8.26561268e+01   3.8628e+00   0.57
diff --git a/tests/01_PW/035_PW_15_SO/log_pr_fixed.txt b/tests/01_PW/035_PW_15_SO/log_pr_fixed.txt
new file mode 100644
index 00000000000..39f8bd62865
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/log_pr_fixed.txt
@@ -0,0 +1,118 @@
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 5837a6526 (Sun May 3 09:44:20 2026 +0800)
+
+ Sun May  3 09:57:21 2026
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 3 for Ga: [Ar] 3d10 4s2 4p1
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(1.2009e-05 SEC) : SETUP UNITCELL
+ DONE(0.00110645 SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       XC          
+ As      1           
+ Ga      1           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.0045507  SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.0123942  SEC) : LOCAL POTENTIAL
+ DONE(0.0494793  SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.266724
+ DONE(0.0495481  SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(0.339499   SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1      0.00e+00   0.00e+00   2.00e+00   2.00e+00  -1.59867930e+03   0.00000000e+00   3.1042e+01   0.85
+ DS2      0.00e+00   0.00e+00   9.75e-01   1.06e+00  -1.68133543e+03  -8.26561268e+01   3.8628e+00   0.25
+ DS3      0.00e+00   0.00e+00   8.68e-01   8.72e-01  -1.67677930e+03   4.55612625e+00   1.0730e+00   0.10
+ DS4      0.00e+00   0.00e+00   7.46e-01   7.69e-01  -1.67820852e+03  -1.42921557e+00   8.1469e-02   0.10
+ DS5      0.00e+00   0.00e+00   7.61e-01   7.70e-01  -1.67833326e+03  -1.24741925e-01   2.3457e-02   0.11
+ DS6      0.00e+00   0.00e+00   7.60e-01   7.69e-01  -1.67835572e+03  -2.24548962e-02   3.2082e-03   0.10
+ DS7      0.00e+00   0.00e+00   7.42e-01   7.50e-01  -1.67836230e+03  -6.58348573e-03   9.5446e-04   0.12
+ DS8      0.00e+00   0.00e+00   7.31e-01   7.38e-01  -1.67836427e+03  -1.97256808e-03   1.3430e-04   0.13
+ DS9      0.00e+00   0.00e+00   7.28e-01   7.35e-01  -1.67836476e+03  -4.90206436e-04   5.3510e-05   0.13
+ DS10     0.00e+00   0.00e+00   7.32e-01   7.40e-01  -1.67836488e+03  -1.12483838e-04   2.8637e-05   0.10
+ DS11     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836501e+03  -1.31704337e-04   1.1546e-05   0.13
+ DS12     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -6.86851806e-05   3.3868e-06   0.15
+ DS13     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -9.16607677e-06   2.4541e-06   0.12
+ DS14     0.00e+00   0.00e+00   7.34e-01   7.41e-01  -1.67836510e+03  -1.34461075e-05   3.5635e-07   0.13
+ ----------------------------------------------------------------
+              Stress_x             Stress_y             Stress_z 
+ ----------------------------------------------------------------
+     -10677.0852150830      -396.2017451132       396.2491608088 
+       -396.2017451132    -10680.4013171834       396.1655869911 
+        396.2491608088       396.1655869911    -10619.9881143378 
+ ----------------------------------------------------------------
+ TOTAL-PRESSURE (EXCLUDE KINETIC PART OF IONS): -10659.158216 kbar
+
+ TIME STATISTICS
+-------------------------------------------------------------------
+    CLASS_NAME           NAME        TIME/s  CALLS   AVG/s  PER/%  
+-------------------------------------------------------------------
+ Driver            atomic_world      2.90   1        2.90   100.00 
+                   total             2.88   14       0.21   99.31  
+ PW_Basis_Sup      recip2real        0.10   250      0.00   3.52   
+ Relax_Driver      relax_driver      2.83   1        2.83   97.56  
+ ESolver_KS        runner            2.81   1        2.81   96.80  
+ ESolver_KS_PW     before_scf        0.29   1        0.29   9.98   
+ Potential         init_pot          0.12   1        0.12   3.98   
+ Potential         cal_veff          0.20   15       0.01   6.84   
+ PW_Basis_Sup      real2recip        0.10   289      0.00   3.58   
+ PotXC             cal_veff          0.18   15       0.01   6.10   
+ XC_Functional     v_xc              0.18   15       0.01   6.09   
+ PSIPrepare        initialize_psi    0.17   1        0.17   5.97   
+ psi_init          random_t          0.17   2        0.08   5.85   
+ psi_init          stick_to_pool     0.11   27664    0.00   3.88   
+ ESolver_KS_PW     hamilt2rho_single 2.37   14       0.17   81.73  
+ HSolverPW         solve             2.37   14       0.17   81.73  
+ HSolverPW         solve_psik        2.00   28       0.07   68.94  
+ Diago_DavSubspace diag_once         1.98   28       0.07   68.07  
+ Diago_DavSubspace first             0.54   28       0.02   18.63  
+ Operator          hPsi              1.52   110      0.01   52.20  
+ Operator          veff_pw           1.45   110      0.01   49.77  
+ PW_Basis_K        recip2real        1.02   8480     0.00   35.19  
+ PW_Basis_K        real2recip        0.73   6352     0.00   25.22  
+ Operator          nonlocal_pw       0.07   110      0.00   2.37   
+ Nonlocal          add_nonlocal_pp   0.03   110      0.00   1.07   
+ Diago_DavSubspace cal_elem          0.06   110      0.00   2.10   
+ Diago_DavSubspace diag_zhegvx       0.17   110      0.00   5.96   
+ Diago_DavSubspace cal_grad          1.19   82       0.01   41.08  
+ Diago_DavSubspace last              0.05   73       0.00   1.65   
+ ElecStatePW       psiToRho          0.34   14       0.02   11.82  
+-------------------------------------------------------------------
+
+
+ START  Time  : Sun May  3 09:57:21 2026
+ FINISH Time  : Sun May  3 09:57:27 2026
+ TOTAL  Time  : 6
+ SEE INFORMATION IN : OUT.autotest/
diff --git a/tests/01_PW/035_PW_15_SO/log_pr_fresh.txt b/tests/01_PW/035_PW_15_SO/log_pr_fresh.txt
new file mode 100644
index 00000000000..b7de0605028
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/log_pr_fresh.txt
@@ -0,0 +1,115 @@
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 5837a6526 (Sun May 3 09:44:20 2026 +0800)
+
+ Sun May  3 10:26:50 2026
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 3 for Ga: [Ar] 3d10 4s2 4p1
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(1.0535e-05 SEC) : SETUP UNITCELL
+ DONE(0.00131561 SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       XC          
+ As      1           
+ Ga      1           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.0131958  SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.0224664  SEC) : LOCAL POTENTIAL
+ DONE(0.0563405  SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.266724
+ DONE(0.0564181  SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(0.0978204  SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1      0.00e+00   0.00e+00   2.00e+00   2.00e+00  -1.59867930e+03   0.00000000e+00   3.1042e+01   0.64
+ DS2      0.00e+00   0.00e+00   9.75e-01   1.06e+00  -1.68133543e+03  -8.26561268e+01   3.8628e+00   0.12
+ DS3      0.00e+00   0.00e+00   8.68e-01   8.72e-01  -1.67677930e+03   4.55612625e+00   1.0730e+00   0.09
+ DS4      0.00e+00   0.00e+00   7.46e-01   7.69e-01  -1.67820852e+03  -1.42921557e+00   8.1469e-02   0.18
+ DS5      0.00e+00   0.00e+00   7.61e-01   7.70e-01  -1.67833326e+03  -1.24741925e-01   2.3457e-02   0.33
+ DS6      0.00e+00   0.00e+00   7.60e-01   7.69e-01  -1.67835572e+03  -2.24548962e-02   3.2082e-03   0.24
+ DS7      0.00e+00   0.00e+00   7.42e-01   7.50e-01  -1.67836230e+03  -6.58348573e-03   9.5446e-04   0.13
+ DS8      0.00e+00   0.00e+00   7.31e-01   7.38e-01  -1.67836427e+03  -1.97256807e-03   1.3430e-04   0.11
+ DS9      0.00e+00   0.00e+00   7.28e-01   7.35e-01  -1.67836476e+03  -4.90206437e-04   5.3510e-05   0.15
+ DS10     0.00e+00   0.00e+00   7.32e-01   7.40e-01  -1.67836488e+03  -1.12483838e-04   2.8637e-05   0.07
+ DS11     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836501e+03  -1.31704337e-04   1.1546e-05   0.15
+ DS12     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -6.86851807e-05   3.3868e-06   0.09
+ DS13     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -9.16607677e-06   2.4541e-06   0.09
+ DS14     0.00e+00   0.00e+00   7.34e-01   7.41e-01  -1.67836510e+03  -1.34461073e-05   3.5635e-07   0.14
+ ----------------------------------------------------------------
+              Stress_x             Stress_y             Stress_z 
+ ----------------------------------------------------------------
+     -10677.0852150830      -396.2017451132       396.2491608088 
+       -396.2017451132    -10680.4013171834       396.1655869911 
+        396.2491608088       396.1655869911    -10619.9881143378 
+ ----------------------------------------------------------------
+ TOTAL-PRESSURE (EXCLUDE KINETIC PART OF IONS): -10659.158216 kbar
+
+ TIME STATISTICS
+-------------------------------------------------------------------
+    CLASS_NAME           NAME        TIME/s  CALLS   AVG/s  PER/%  
+-------------------------------------------------------------------
+ Driver            atomic_world      2.68   1        2.68   100.00 
+                   total             2.66   14       0.19   98.99  
+ PW_Basis_Sup      recip2real        0.05   250      0.00   1.86   
+ ppcell_vnl        init_vnl          0.03   1        0.03   1.11   
+ Relax_Driver      relax_driver      2.60   1        2.60   96.82  
+ ESolver_KS        runner            2.58   1        2.58   96.13  
+ ESolver_KS_PW     before_scf        0.04   1        0.04   1.54   
+ Potential         cal_veff          0.09   15       0.01   3.35   
+ PW_Basis_Sup      real2recip        0.05   289      0.00   1.94   
+ PotXC             cal_veff          0.08   15       0.01   3.03   
+ XC_Functional     v_xc              0.08   15       0.01   3.01   
+ PSIPrepare        initialize_psi    0.04   1        0.04   1.40   
+ psi_init          random_t          0.04   2        0.02   1.39   
+ ESolver_KS_PW     hamilt2rho_single 2.40   14       0.17   89.52  
+ HSolverPW         solve             2.40   14       0.17   89.52  
+ HSolverPW         solve_psik        2.18   28       0.08   81.22  
+ Diago_DavSubspace diag_once         2.14   28       0.08   79.91  
+ Diago_DavSubspace first             0.63   28       0.02   23.36  
+ Operator          hPsi              1.62   110      0.01   60.26  
+ Operator          veff_pw           1.56   110      0.01   58.04  
+ PW_Basis_K        recip2real        0.88   8480     0.00   32.85  
+ PW_Basis_K        real2recip        0.85   6352     0.00   31.71  
+ Operator          nonlocal_pw       0.06   110      0.00   2.15   
+ Diago_DavSubspace cal_elem          0.08   110      0.00   2.86   
+ Diago_DavSubspace diag_zhegvx       0.16   110      0.00   5.91   
+ Diago_DavSubspace cal_grad          1.30   82       0.02   48.42  
+ ElecStatePW       psiToRho          0.21   14       0.01   7.65   
+-------------------------------------------------------------------
+
+
+ START  Time  : Sun May  3 10:26:50 2026
+ FINISH Time  : Sun May  3 10:26:53 2026
+ TOTAL  Time  : 3
+ SEE INFORMATION IN : OUT.autotest/
diff --git a/tests/01_PW/035_PW_15_SO/log_pr_np4.txt b/tests/01_PW/035_PW_15_SO/log_pr_np4.txt
new file mode 100644
index 00000000000..9c5a7e6b7eb
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/log_pr_np4.txt
@@ -0,0 +1,117 @@
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 55690612c (Sat May 2 13:10:55 2026 +0800)
+
+ Sun May  3 09:54:29 2026
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 3 for Ga: [Ar] 3d10 4s2 4p1
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(9.938e-06  SEC) : SETUP UNITCELL
+ DONE(0.00258591 SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       XC          
+ As      1           
+ Ga      1           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.00815335 SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.0209517  SEC) : LOCAL POTENTIAL
+ DONE(0.0517217  SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.266724
+ DONE(0.0518776  SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(0.142905   SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1      0.00e+00   0.00e+00   2.00e+00   2.00e+00  -1.59867930e+03   0.00000000e+00   3.1042e+01   0.80
+ DS2      0.00e+00   0.00e+00   9.75e-01   1.06e+00  -1.68133543e+03  -8.26561268e+01   3.8628e+00   0.19
+ DS3      0.00e+00   0.00e+00   8.68e-01   8.72e-01  -1.67677930e+03   4.55612625e+00   1.0730e+00   0.09
+ DS4      0.00e+00   0.00e+00   7.46e-01   7.69e-01  -1.67820852e+03  -1.42921557e+00   8.1469e-02   0.11
+ DS5      0.00e+00   0.00e+00   7.61e-01   7.70e-01  -1.67833326e+03  -1.24741925e-01   2.3457e-02   0.19
+ DS6      0.00e+00   0.00e+00   7.60e-01   7.69e-01  -1.67835572e+03  -2.24548962e-02   3.2082e-03   0.11
+ DS7      0.00e+00   0.00e+00   7.42e-01   7.50e-01  -1.67836230e+03  -6.58348573e-03   9.5446e-04   0.17
+ DS8      0.00e+00   0.00e+00   7.31e-01   7.38e-01  -1.67836427e+03  -1.97256808e-03   1.3430e-04   0.11
+ DS9      0.00e+00   0.00e+00   7.28e-01   7.35e-01  -1.67836476e+03  -4.90206437e-04   5.3510e-05   0.08
+ DS10     0.00e+00   0.00e+00   7.32e-01   7.40e-01  -1.67836488e+03  -1.12483838e-04   2.8637e-05   0.12
+ DS11     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836501e+03  -1.31704337e-04   1.1546e-05   0.10
+ DS12     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -6.86851813e-05   3.3868e-06   0.13
+ DS13     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -9.16607619e-06   2.4541e-06   0.33
+ DS14     0.00e+00   0.00e+00   7.34e-01   7.41e-01  -1.67836510e+03  -1.34461073e-05   3.5635e-07   0.30
+ ----------------------------------------------------------------
+              Stress_x             Stress_y             Stress_z 
+ ----------------------------------------------------------------
+     -10677.0852150830      -396.2017451132       396.2491608088 
+       -396.2017451132    -10680.4013171834       396.1655869911 
+        396.2491608088       396.1655869911    -10619.9881143378 
+ ----------------------------------------------------------------
+ TOTAL-PRESSURE (EXCLUDE KINETIC PART OF IONS): -10659.158216 kbar
+
+ TIME STATISTICS
+-------------------------------------------------------------------
+    CLASS_NAME           NAME        TIME/s  CALLS   AVG/s  PER/%  
+-------------------------------------------------------------------
+ Driver            atomic_world      3.06   1        3.06   100.00 
+                   total             3.03   14       0.22   99.14  
+ PW_Basis_Sup      recip2real        0.09   250      0.00   3.07   
+ Relax_Driver      relax_driver      2.98   1        2.98   97.39  
+ ESolver_KS        runner            2.95   1        2.95   96.49  
+ ESolver_KS_PW     before_scf        0.09   1        0.09   2.97   
+ Potential         cal_veff          0.16   15       0.01   5.10   
+ PW_Basis_Sup      real2recip        0.08   289      0.00   2.72   
+ PotXC             cal_veff          0.14   15       0.01   4.57   
+ XC_Functional     v_xc              0.14   15       0.01   4.56   
+ PSIPrepare        initialize_psi    0.07   1        0.07   2.17   
+ psi_init          random_t          0.07   2        0.03   2.15   
+ psi_init          stick_to_pool     0.05   27664    0.00   1.59   
+ ESolver_KS_PW     hamilt2rho_single 2.64   14       0.19   86.51  
+ HSolverPW         solve             2.64   14       0.19   86.50  
+ HSolverPW         solve_psik        2.35   28       0.08   77.01  
+ Diago_DavSubspace diag_once         2.33   28       0.08   76.31  
+ Diago_DavSubspace first             0.66   28       0.02   21.68  
+ Operator          hPsi              1.83   110      0.02   59.89  
+ Operator          veff_pw           1.76   110      0.02   57.63  
+ PW_Basis_K        recip2real        1.12   8480     0.00   36.48  
+ PW_Basis_K        real2recip        0.88   6352     0.00   28.83  
+ Operator          nonlocal_pw       0.07   110      0.00   2.20   
+ Nonlocal          add_nonlocal_pp   0.03   110      0.00   1.00   
+ Diago_DavSubspace cal_elem          0.08   110      0.00   2.57   
+ Diago_DavSubspace diag_zhegvx       0.19   110      0.00   6.07   
+ Diago_DavSubspace cal_grad          1.42   82       0.02   46.56  
+ Diago_DavSubspace last              0.03   73       0.00   1.06   
+ ElecStatePW       psiToRho          0.27   14       0.02   8.68   
+-------------------------------------------------------------------
+
+
+ START  Time  : Sun May  3 09:54:29 2026
+ FINISH Time  : Sun May  3 09:54:32 2026
+ TOTAL  Time  : 3
+ SEE INFORMATION IN : OUT.autotest/
diff --git a/tests/01_PW/035_PW_15_SO/log_v2.txt b/tests/01_PW/035_PW_15_SO/log_v2.txt
new file mode 100644
index 00000000000..d9a1d0acec2
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/log_v2.txt
@@ -0,0 +1,120 @@
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 5837a6526 (Sun May 3 09:44:20 2026 +0800)
+
+ Sun May  3 11:35:24 2026
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 3 for Ga: [Ar] 3d10 4s2 4p1
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(8.644e-06  SEC) : SETUP UNITCELL
+ DONE(0.0042798  SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       XC          
+ As      1           
+ Ga      1           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.0141793  SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.0260913  SEC) : LOCAL POTENTIAL
+ DONE(0.0581521  SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.266724
+ DONE(0.05822    SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(0.11065    SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1      0.00e+00   0.00e+00   2.00e+00   2.00e+00  -1.59867930e+03   0.00000000e+00   3.1042e+01   0.79
+ DS2      0.00e+00   0.00e+00   9.75e-01   1.06e+00  -1.68133543e+03  -8.26561268e+01   3.8628e+00   0.20
+ DS3      0.00e+00   0.00e+00   8.68e-01   8.72e-01  -1.67677930e+03   4.55612625e+00   1.0730e+00   0.12
+ DS4      0.00e+00   0.00e+00   7.46e-01   7.69e-01  -1.67820852e+03  -1.42921557e+00   8.1469e-02   0.14
+ DS5      0.00e+00   0.00e+00   7.61e-01   7.70e-01  -1.67833326e+03  -1.24741925e-01   2.3457e-02   0.17
+ DS6      0.00e+00   0.00e+00   7.60e-01   7.69e-01  -1.67835572e+03  -2.24548962e-02   3.2082e-03   0.14
+ DS7      0.00e+00   0.00e+00   7.42e-01   7.50e-01  -1.67836230e+03  -6.58348573e-03   9.5446e-04   0.15
+ DS8      0.00e+00   0.00e+00   7.31e-01   7.38e-01  -1.67836427e+03  -1.97256807e-03   1.3430e-04   0.16
+ DS9      0.00e+00   0.00e+00   7.28e-01   7.35e-01  -1.67836476e+03  -4.90206436e-04   5.3510e-05   0.11
+ DS10     0.00e+00   0.00e+00   7.32e-01   7.40e-01  -1.67836488e+03  -1.12483839e-04   2.8637e-05   0.28
+ DS11     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836501e+03  -1.31704337e-04   1.1546e-05   0.12
+ DS12     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -6.86851807e-05   3.3868e-06   0.19
+ DS13     0.00e+00   0.00e+00   7.33e-01   7.41e-01  -1.67836508e+03  -9.16607619e-06   2.4541e-06   0.18
+ DS14     0.00e+00   0.00e+00   7.34e-01   7.41e-01  -1.67836510e+03  -1.34461080e-05   3.5635e-07   0.38
+ ----------------------------------------------------------------
+              Stress_x             Stress_y             Stress_z 
+ ----------------------------------------------------------------
+     -10677.0852150830      -396.2017451131       396.2491608088 
+       -396.2017451131    -10680.4013171834       396.1655869911 
+        396.2491608088       396.1655869911    -10619.9881143378 
+ ----------------------------------------------------------------
+ TOTAL-PRESSURE (EXCLUDE KINETIC PART OF IONS): -10659.158216 kbar
+
+ TIME STATISTICS
+-------------------------------------------------------------------
+    CLASS_NAME           NAME        TIME/s  CALLS   AVG/s  PER/%  
+-------------------------------------------------------------------
+ Driver            atomic_world      3.28   1        3.28   100.00 
+                   total             3.26   14       0.23   99.42  
+ PW_Basis_Sup      recip2real        0.09   250      0.00   2.79   
+ Relax_Driver      relax_driver      3.20   1        3.20   97.61  
+ ESolver_KS        runner            3.18   1        3.18   96.80  
+ ESolver_KS_PW     before_scf        0.05   1        0.05   1.59   
+ Potential         cal_veff          0.17   15       0.01   5.25   
+ PW_Basis_Sup      real2recip        0.10   289      0.00   3.15   
+ PotXC             cal_veff          0.15   15       0.01   4.52   
+ XC_Functional     v_xc              0.15   15       0.01   4.50   
+ PSIPrepare        initialize_psi    0.04   1        0.04   1.16   
+ psi_init          random_t          0.04   2        0.02   1.15   
+ ESolver_KS_PW     hamilt2rho_single 2.86   14       0.20   87.19  
+ HSolverPW         solve             2.86   14       0.20   87.18  
+ HSolverPW         solve_psik        2.54   28       0.09   77.57  
+ Diago_DavSubspace diag_once         2.49   28       0.09   75.99  
+ Diago_DavSubspace first             0.76   28       0.03   23.02  
+ Operator          hPsi              1.80   110      0.02   54.97  
+ Operator          veff_pw           1.72   110      0.02   52.44  
+ PW_Basis_K        recip2real        1.09   8480     0.00   33.16  
+ PW_Basis_K        real2recip        0.90   6352     0.00   27.42  
+ Operator          nonlocal_pw       0.08   110      0.00   2.48   
+ Nonlocal          add_nonlocal_pp   0.04   110      0.00   1.28   
+ Diago_DavSubspace cal_elem          0.09   110      0.00   2.70   
+ Diago_DavSubspace diag_zhegvx       0.20   110      0.00   6.11   
+ Diago_DavSubspace cal_grad          1.47   82       0.02   44.65  
+ Diago_DavSubspace last              0.04   73       0.00   1.12   
+ ElecStatePW       psiToRho          0.30   14       0.02   9.17   
+ Charge_Mixing     mix_rho           0.06   13       0.00   1.75   
+ Charge_Mixing     mix_rho_recip     0.06   13       0.00   1.71   
+ Broyden_Mixing    tem_cal_coef      0.04   13       0.00   1.14   
+ Charge_Mixing     recip_hartree     0.04   136      0.00   1.11   
+-------------------------------------------------------------------
+
+
+ START  Time  : Sun May  3 11:35:24 2026
+ FINISH Time  : Sun May  3 11:35:27 2026
+ TOTAL  Time  : 3
+ SEE INFORMATION IN : OUT.autotest/
diff --git a/tests/01_PW/035_PW_15_SO/result_all_fix.out b/tests/01_PW/035_PW_15_SO/result_all_fix.out
new file mode 100644
index 00000000000..1b437968bef
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/result_all_fix.out
@@ -0,0 +1,5 @@
+etotref -1678.3650981686610066
+etotperatomref -839.1825490843
+totalforceref 1.740848
+totalstressref 34372.194072
+totaltimeref 25.68
diff --git a/tests/01_PW/035_PW_15_SO/result_dev_np4.out b/tests/01_PW/035_PW_15_SO/result_dev_np4.out
new file mode 100644
index 00000000000..a32b38e9299
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/result_dev_np4.out
@@ -0,0 +1,5 @@
+etotref -1678.3650981686614614
+etotperatomref -839.1825490843
+totalforceref 1.739332
+totalstressref 34372.194072
+totaltimeref 4.03
diff --git a/tests/01_PW/035_PW_15_SO/result_final.out b/tests/01_PW/035_PW_15_SO/result_final.out
new file mode 100644
index 00000000000..797117b6d0c
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/result_final.out
@@ -0,0 +1,5 @@
+etotref 
+etotperatomref 
+totalforceref 0.0
+totalstressref 0.0
+totaltimeref 
diff --git a/tests/01_PW/035_PW_15_SO/result_pr_fixed.out b/tests/01_PW/035_PW_15_SO/result_pr_fixed.out
new file mode 100644
index 00000000000..793630ed73c
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/result_pr_fixed.out
@@ -0,0 +1,5 @@
+etotref -1678.3650981686610066
+etotperatomref -839.1825490843
+totalforceref 1.740848
+totalstressref 34354.707632
+totaltimeref 2.90
diff --git a/tests/01_PW/035_PW_15_SO/result_pr_np4.out b/tests/01_PW/035_PW_15_SO/result_pr_np4.out
new file mode 100644
index 00000000000..41410ff42be
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/result_pr_np4.out
@@ -0,0 +1,5 @@
+etotref -1678.3650981686610066
+etotperatomref -839.1825490843
+totalforceref 1.740848
+totalstressref 34354.707632
+totaltimeref 3.06
diff --git a/tests/01_PW/035_PW_15_SO/result_v2.out b/tests/01_PW/035_PW_15_SO/result_v2.out
new file mode 100644
index 00000000000..446d7141fb3
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/result_v2.out
@@ -0,0 +1,5 @@
+etotref -1678.3650981686614614
+etotperatomref -839.1825490843
+totalforceref 1.740848
+totalstressref 34354.707632
+totaltimeref 3.28
diff --git a/tests/01_PW/035_PW_15_SO/result_v2_check.out b/tests/01_PW/035_PW_15_SO/result_v2_check.out
new file mode 100644
index 00000000000..0becf5e1a82
--- /dev/null
+++ b/tests/01_PW/035_PW_15_SO/result_v2_check.out
@@ -0,0 +1,5 @@
+etotref -1678.3650981686612340
+etotperatomref -839.1825490843
+totalforceref 1.739332
+totalstressref 34372.194072
+totaltimeref 3.68
diff --git a/tests/01_PW/099_PW_DJ_SO/log_dev_np1.txt b/tests/01_PW/099_PW_DJ_SO/log_dev_np1.txt
new file mode 100644
index 00000000000..b99d0cca01c
--- /dev/null
+++ b/tests/01_PW/099_PW_DJ_SO/log_dev_np1.txt
@@ -0,0 +1,123 @@
+Info: Local MPI proc number: 1,OpenMP thread number: 1,Total thread number: 1,Local thread limit: 14
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 0f9d7d97e (Thu Apr 30 12:48:20 2026 +0800)
+
+ Sun May  3 09:53:18 2026
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 8 for Fe: [Ar] 3d6 4s2
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(0.0392222  SEC) : SETUP UNITCELL
+ DONE(0.0393218  SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               1           1             1             
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       
+ Fe      2           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.0422761  SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.0474001  SEC) : LOCAL POTENTIAL
+ DONE(0.0583962  SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.361328
+ DONE(0.0934921  SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(0.115037   SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1     -3.50e-03  -3.47e-03  -3.47e-03   1.94e-01  -5.93364556e+03   0.00000000e+00   6.0771e+01   0.17
+ DS2     -1.98e-02  -1.99e-02  -1.99e-02   5.37e-02  -5.61422656e+03   3.19418997e+02   2.7921e+01   0.10
+ DS3      3.63e-01   3.63e-01   3.63e-01   6.30e-01  -5.66083219e+03  -4.66056224e+01   9.3630e-01   0.12
+ DS4      6.56e-01   6.55e-01   6.55e-01   1.13e+00  -5.66314277e+03  -2.31058782e+00   9.7970e-01   0.09
+ DS5      1.13e+00   1.13e+00   1.13e+00   1.96e+00  -5.66288810e+03   2.54671960e-01   8.4319e-01   0.09
+ DS6      1.53e+00   1.53e+00   1.53e+00   2.66e+00  -5.65330287e+03   9.58522743e+00   6.5627e-01   0.09
+ DS7      3.47e+00   3.46e+00   3.46e+00   6.00e+00  -5.66100286e+03  -7.69999107e+00   3.6125e-01   0.12
+ DS8      4.01e+00   3.99e+00   3.99e+00   6.93e+00  -5.66254900e+03  -1.54613591e+00   3.2290e-01   0.10
+ DS9      4.09e+00   4.06e+00   4.06e+00   7.05e+00  -5.66250300e+03   4.59968832e-02   2.6478e-01   0.09
+ DS10     4.08e+00   4.04e+00   4.04e+00   7.03e+00  -5.66203009e+03   4.72914699e-01   1.4532e-01   0.10
+ DS11     4.29e+00   4.26e+00   4.26e+00   7.40e+00  -5.66220039e+03  -1.70299816e-01   2.5843e-02   0.10
+ DS12     4.63e+00   4.59e+00   4.59e+00   7.98e+00  -5.66227859e+03  -7.81989233e-02   6.0138e-02   0.09
+ DS13     4.64e+00   4.60e+00   4.60e+00   8.00e+00  -5.66242306e+03  -1.44469498e-01   2.8920e-02   0.10
+ DS14     4.64e+00   4.60e+00   4.60e+00   8.00e+00  -5.66243570e+03  -1.26406010e-02   2.4667e-02   0.09
+ DS15     4.65e+00   4.61e+00   4.61e+00   8.01e+00  -5.66242785e+03   7.85163952e-03   1.4419e-02   0.07
+ DS16     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66242977e+03  -1.92022552e-03   6.7341e-03   0.12
+ DS17     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66241657e+03   1.31921202e-02   4.8540e-03   0.12
+ DS18     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66239501e+03   2.15685563e-02   4.0954e-03   0.09
+ DS19     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66238415e+03   1.08550899e-02   1.5128e-03   0.10
+ SCF restart after this step!
+ DS20     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66286314e+03  -4.78985858e-01   1.4389e-04   0.10
+ DS21     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66239196e+03   4.71176545e-01   6.4087e-05   0.09
+ DS22     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66238948e+03   2.48237415e-03   5.5053e-06   0.08
+ ----------------------------------------------------------------
+              Stress_x             Stress_y             Stress_z 
+ ----------------------------------------------------------------
+     -31999.2856202446        64.7867976142        64.7955475894 
+         64.7867976142    -33600.9735805777       560.6550312603 
+         64.7955475894       560.6550312603    -33600.9824691361 
+ ----------------------------------------------------------------
+ TOTAL-PRESSURE (EXCLUDE KINETIC PART OF IONS): -33067.080557 kbar
+
+ TIME STATISTICS
+-------------------------------------------------------------------
+    CLASS_NAME           NAME        TIME/s  CALLS   AVG/s  PER/%  
+-------------------------------------------------------------------
+                   total             2.37   15       0.16   100.00 
+ Driver            atomic_world      2.37   1        2.37   100.00 
+ PW_Basis_Sup      recip2real        0.04   397      0.00   1.70   
+ PSIPrepare        prepare_init      0.03   1        0.03   1.47   
+ psi_init_atomic   tabulate          0.03   1        0.03   1.47   
+ Relax_Driver      relax_driver      2.28   1        2.28   95.97  
+ ESolver_KS        runner            2.24   1        2.24   94.41  
+ Potential         cal_veff          0.18   23       0.01   7.74   
+ PW_Basis_Sup      real2recip        0.04   463      0.00   1.79   
+ PotXC             cal_veff          0.18   23       0.01   7.42   
+ XC_Functional     v_xc              0.19   25       0.01   7.82   
+ ESolver_KS_PW     hamilt2rho_single 1.99   22       0.09   83.75  
+ HSolverPW         solve             1.99   22       0.09   83.74  
+ HSolverPW         solve_psik        1.68   44       0.04   70.83  
+ Diago_DavSubspace diag_once         1.68   44       0.04   70.68  
+ Diago_DavSubspace first             0.64   44       0.01   26.98  
+ Operator          hPsi              1.40   187      0.01   59.08  
+ Operator          veff_pw           1.30   187      0.01   54.94  
+ PW_Basis_K        recip2real        0.80   11858    0.00   33.88  
+ PW_Basis_K        real2recip        0.53   8338     0.00   22.30  
+ Operator          nonlocal_pw       0.08   187      0.00   3.39   
+ Nonlocal          add_nonlocal_pp   0.06   187      0.00   2.47   
+ Diago_DavSubspace cal_elem          0.03   187      0.00   1.30   
+ Diago_DavSubspace diag_zhegvx       0.19   187      0.00   7.87   
+ Diago_DavSubspace cal_grad          0.82   143      0.01   34.33  
+ Diago_DavSubspace last              0.03   86       0.00   1.45   
+ ElecStatePW       psiToRho          0.30   22       0.01   12.59  
+ Stress_PW         cal_stress        0.02   1        0.02   1.01   
+-------------------------------------------------------------------
+
+
+ START  Time  : Sun May  3 09:53:18 2026
+ FINISH Time  : Sun May  3 09:53:20 2026
+ TOTAL  Time  : 2
+ SEE INFORMATION IN : OUT.autotest/
diff --git a/tests/01_PW/099_PW_DJ_SO/log_dev_np4.txt b/tests/01_PW/099_PW_DJ_SO/log_dev_np4.txt
new file mode 100644
index 00000000000..3447cc7fe57
--- /dev/null
+++ b/tests/01_PW/099_PW_DJ_SO/log_dev_np4.txt
@@ -0,0 +1,123 @@
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 0f9d7d97e (Thu Apr 30 12:48:20 2026 +0800)
+
+ Sun May  3 09:52:57 2026
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 8 for Fe: [Ar] 3d6 4s2
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(0.0337128  SEC) : SETUP UNITCELL
+ DONE(0.0346183  SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       
+ Fe      2           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.0401519  SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.072056   SEC) : LOCAL POTENTIAL
+ DONE(0.0793844  SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.0878906
+ DONE(0.107285   SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(0.151379   SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1     -3.50e-03  -3.47e-03  -3.47e-03   1.94e-01  -5.93364516e+03   0.00000000e+00   6.0771e+01   0.48
+ DS2     -1.98e-02  -1.99e-02  -1.99e-02   5.38e-02  -5.61422933e+03   3.19415829e+02   2.7921e+01   0.22
+ DS3      3.64e-01   3.62e-01   3.63e-01   6.30e-01  -5.66083209e+03  -4.66027629e+01   9.3631e-01   0.12
+ DS4      6.56e-01   6.54e-01   6.54e-01   1.13e+00  -5.66314237e+03  -2.31027985e+00   9.7969e-01   0.13
+ DS5      1.14e+00   1.13e+00   1.13e+00   1.96e+00  -5.66288782e+03   2.54552547e-01   8.4317e-01   0.12
+ DS6      1.54e+00   1.53e+00   1.53e+00   2.66e+00  -5.65330389e+03   9.58392934e+00   6.5624e-01   0.09
+ DS7      3.48e+00   3.45e+00   3.45e+00   6.00e+00  -5.66100392e+03  -7.70002981e+00   3.6100e-01   0.16
+ DS8      4.02e+00   3.98e+00   3.98e+00   6.93e+00  -5.66255040e+03  -1.54648025e+00   3.2295e-01   0.17
+ DS9      4.09e+00   4.05e+00   4.05e+00   7.05e+00  -5.66250306e+03   4.73376558e-02   2.6493e-01   0.13
+ DS10     4.08e+00   4.04e+00   4.04e+00   7.03e+00  -5.66202969e+03   4.73376077e-01   1.4527e-01   0.13
+ DS11     4.29e+00   4.25e+00   4.25e+00   7.40e+00  -5.66220119e+03  -1.71501950e-01   2.5845e-02   0.12
+ DS12     4.64e+00   4.59e+00   4.59e+00   7.98e+00  -5.66227828e+03  -7.70963311e-02   6.0170e-02   0.13
+ DS13     4.65e+00   4.60e+00   4.60e+00   8.00e+00  -5.66242325e+03  -1.44967260e-01   2.8907e-02   0.13
+ DS14     4.65e+00   4.60e+00   4.60e+00   8.00e+00  -5.66243546e+03  -1.22093845e-02   2.4687e-02   0.11
+ DS15     4.65e+00   4.61e+00   4.61e+00   8.01e+00  -5.66242798e+03   7.48036237e-03   1.4412e-02   0.10
+ DS16     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66242955e+03  -1.56605404e-03   6.6989e-03   0.12
+ DS17     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66241570e+03   1.38475343e-02   4.8441e-03   0.06
+ DS18     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66239561e+03   2.00926869e-02   4.0264e-03   0.10
+ DS19     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66238472e+03   1.08890233e-02   1.3802e-03   0.07
+ SCF restart after this step!
+ DS20     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66288453e+03  -4.99809949e-01   1.4626e-04   0.11
+ DS21     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66239029e+03   4.94239545e-01   3.0808e-04   0.11
+ DS22     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66239089e+03  -5.99121242e-04   7.3385e-06   0.10
+ ----------------------------------------------------------------
+              Stress_x             Stress_y             Stress_z 
+ ----------------------------------------------------------------
+     -31999.6098887833        65.0629329717        64.9895792749 
+         65.0629329717    -33601.2027303285       560.2485373745 
+         64.9895792749       560.2485373745    -33601.1924915668 
+ ----------------------------------------------------------------
+ TOTAL-PRESSURE (EXCLUDE KINETIC PART OF IONS): -33067.335037 kbar
+
+ TIME STATISTICS
+-------------------------------------------------------------------
+    CLASS_NAME           NAME        TIME/s  CALLS   AVG/s  PER/%  
+-------------------------------------------------------------------
+                   total             3.18   15       0.21   100.00 
+ Driver            atomic_world      3.18   1        3.18   100.00 
+ Charge            atomic_rho        0.03   2        0.02   1.07   
+ PW_Basis_Sup      recip2real        0.07   397      0.00   2.06   
+ Relax_Driver      relax_driver      3.07   1        3.07   96.59  
+ ESolver_KS        runner            3.04   1        3.04   95.62  
+ ESolver_KS_PW     before_scf        0.04   1        0.04   1.38   
+ Potential         cal_veff          0.12   23       0.01   3.93   
+ PW_Basis_Sup      real2recip        0.08   463      0.00   2.63   
+ PotXC             cal_veff          0.12   23       0.01   3.73   
+ XC_Functional     v_xc              0.12   25       0.00   3.81   
+ ESolver_KS_PW     hamilt2rho_single 2.80   22       0.13   88.32  
+ HSolverPW         solve             2.80   22       0.13   88.30  
+ HSolverPW         solve_psik        2.38   44       0.05   75.10  
+ Diago_DavSubspace diag_once         2.37   44       0.05   74.74  
+ Diago_DavSubspace first             0.87   44       0.02   27.27  
+ Operator          hPsi              1.87   197      0.01   58.80  
+ Operator          veff_pw           1.76   197      0.01   55.29  
+ PW_Basis_K        recip2real        1.19   11904    0.00   37.44  
+ PW_Basis_K        real2recip        0.93   8384     0.00   29.21  
+ Operator          nonlocal_pw       0.06   197      0.00   2.02   
+ Operator          OnsiteProjPW      0.05   197      0.00   1.44   
+ OnsiteProj        overlap           0.05   241      0.00   1.54   
+ Onsite_Proj_tools cal_becp          0.05   245      0.00   1.60   
+ Diago_DavSubspace cal_elem          0.07   197      0.00   2.23   
+ Diago_DavSubspace diag_zhegvx       0.29   197      0.00   9.01   
+ Diago_DavSubspace cal_grad          1.19   153      0.01   37.36  
+ ElecStatePW       psiToRho          0.41   22       0.02   12.80  
+-------------------------------------------------------------------
+
+
+ START  Time  : Sun May  3 09:52:57 2026
+ FINISH Time  : Sun May  3 09:53:00 2026
+ TOTAL  Time  : 3
+ SEE INFORMATION IN : OUT.autotest/
diff --git a/tests/01_PW/099_PW_DJ_SO/log_final.txt b/tests/01_PW/099_PW_DJ_SO/log_final.txt
new file mode 100644
index 00000000000..5683c705208
--- /dev/null
+++ b/tests/01_PW/099_PW_DJ_SO/log_final.txt
@@ -0,0 +1,70 @@
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 5837a6526 (Sun May 3 09:44:20 2026 +0800)
+
+ Sun May  3 11:41:03 2026
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 8 for Fe: [Ar] 3d6 4s2
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(9.884e-06  SEC) : SETUP UNITCELL
+ DONE(0.00292437 SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       XC          
+ Fe      2           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.00712453 SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.0337331  SEC) : LOCAL POTENTIAL
+ DONE(0.0423397  SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.0878906
+ DONE(0.066817   SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(0.122505   SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1     -3.50e-03  -3.47e-03  -3.47e-03   1.94e-01  -5.93364516e+03   0.00000000e+00   6.0771e+01   0.37
+ DS2     -1.98e-02  -1.99e-02  -1.99e-02   5.38e-02  -5.61422933e+03   3.19415829e+02   2.7921e+01   0.19
+ DS3      3.64e-01   3.62e-01   3.63e-01   6.30e-01  -5.66083209e+03  -4.66027629e+01   9.3631e-01   0.50
+ DS4      6.56e-01   6.54e-01   6.54e-01   1.13e+00  -5.66314237e+03  -2.31027985e+00   9.7969e-01   0.20
+ DS5      1.14e+00   1.13e+00   1.13e+00   1.96e+00  -5.66288782e+03   2.54552547e-01   8.4317e-01   0.15
+ DS6      1.54e+00   1.53e+00   1.53e+00   2.66e+00  -5.65330389e+03   9.58392934e+00   6.5624e-01   0.20
+ DS7      3.48e+00   3.45e+00   3.45e+00   6.00e+00  -5.66100392e+03  -7.70002981e+00   3.6100e-01   0.52
+ DS8      4.02e+00   3.98e+00   3.98e+00   6.93e+00  -5.66255040e+03  -1.54648025e+00   3.2295e-01  15.43
+ DS9      4.09e+00   4.05e+00   4.05e+00   7.05e+00  -5.66250306e+03   4.73376558e-02   2.6493e-01  24.33
+ DS10     4.08e+00   4.04e+00   4.04e+00   7.03e+00  -5.66202969e+03   4.73376077e-01   1.4527e-01  22.20
+ DS11     4.29e+00   4.25e+00   4.25e+00   7.40e+00  -5.66220119e+03  -1.71501951e-01   2.5845e-02  26.38
+ DS12     4.64e+00   4.59e+00   4.59e+00   7.98e+00  -5.66227828e+03  -7.70963305e-02   6.0170e-02  29.10
+ DS13     4.65e+00   4.60e+00   4.60e+00   8.00e+00  -5.66242325e+03  -1.44967260e-01   2.8907e-02  29.12
+ DS14     4.65e+00   4.60e+00   4.60e+00   8.00e+00  -5.66243546e+03  -1.22093846e-02   2.4687e-02  23.60
+ DS15     4.65e+00   4.61e+00   4.61e+00   8.01e+00  -5.66242798e+03   7.48036259e-03   1.4412e-02  26.20
+ DS16     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66242955e+03  -1.56605404e-03   6.6989e-03  28.97
+ DS17     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66241570e+03   1.38475345e-02   4.8441e-03  23.27
+ DS18     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66239561e+03   2.00926873e-02   4.0264e-03  26.64
diff --git a/tests/01_PW/099_PW_DJ_SO/log_pr_correct.txt b/tests/01_PW/099_PW_DJ_SO/log_pr_correct.txt
new file mode 100644
index 00000000000..5a00ae0ec02
--- /dev/null
+++ b/tests/01_PW/099_PW_DJ_SO/log_pr_correct.txt
@@ -0,0 +1,60 @@
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 5837a6526 (Sun May 3 09:44:20 2026 +0800)
+
+ Sun May  3 11:32:44 2026
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 8 for Fe: [Ar] 3d6 4s2
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(9.998e-06  SEC) : SETUP UNITCELL
+ DONE(0.000129725 SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       XC          
+ Fe      2           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.00638782 SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.0215191  SEC) : LOCAL POTENTIAL
+ DONE(0.0357419  SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.0878906
+ DONE(0.0607546  SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(0.0915078  SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1     -3.50e-03  -3.47e-03  -3.47e-03   1.94e-01  -5.93364516e+03   0.00000000e+00   6.0771e+01   0.92
+ DS2     -1.98e-02  -1.99e-02  -1.99e-02   5.38e-02  -5.61422933e+03   3.19415829e+02   2.7921e+01   0.56
+ DS3      3.64e-01   3.62e-01   3.63e-01   6.30e-01  -5.66083209e+03  -4.66027629e+01   9.3631e-01   0.33
+ DS4      6.56e-01   6.54e-01   6.54e-01   1.13e+00  -5.66314237e+03  -2.31027985e+00   9.7969e-01   0.47
+ DS5      1.14e+00   1.13e+00   1.13e+00   1.96e+00  -5.66288782e+03   2.54552547e-01   8.4317e-01  14.08
+ DS6      1.54e+00   1.53e+00   1.53e+00   2.66e+00  -5.65330389e+03   9.58392934e+00   6.5624e-01  24.04
+ DS7      3.48e+00   3.45e+00   3.45e+00   6.00e+00  -5.66100392e+03  -7.70002981e+00   3.6100e-01  34.89
+ DS8      4.02e+00   3.98e+00   3.98e+00   6.93e+00  -5.66255040e+03  -1.54648025e+00   3.2295e-01  28.96
diff --git a/tests/01_PW/099_PW_DJ_SO/log_pr_fixed.txt b/tests/01_PW/099_PW_DJ_SO/log_pr_fixed.txt
new file mode 100644
index 00000000000..acb5dca1422
--- /dev/null
+++ b/tests/01_PW/099_PW_DJ_SO/log_pr_fixed.txt
@@ -0,0 +1,122 @@
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 5837a6526 (Sun May 3 09:44:20 2026 +0800)
+
+ Sun May  3 09:57:03 2026
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 8 for Fe: [Ar] 3d6 4s2
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(1.1268e-05 SEC) : SETUP UNITCELL
+ DONE(0.00231719 SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       XC          
+ Fe      2           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.0125158  SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.0251662  SEC) : LOCAL POTENTIAL
+ DONE(0.0328194  SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.0878906
+ DONE(0.0604581  SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(0.0907335  SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1     -3.50e-03  -3.47e-03  -3.47e-03   1.94e-01  -5.93364516e+03   0.00000000e+00   6.0771e+01   0.26
+ DS2     -1.98e-02  -1.99e-02  -1.99e-02   5.38e-02  -5.61422933e+03   3.19415829e+02   2.7921e+01   0.17
+ DS3      3.64e-01   3.62e-01   3.63e-01   6.30e-01  -5.66083209e+03  -4.66027629e+01   9.3631e-01   0.15
+ DS4      6.56e-01   6.54e-01   6.54e-01   1.13e+00  -5.66314237e+03  -2.31027985e+00   9.7969e-01   0.14
+ DS5      1.14e+00   1.13e+00   1.13e+00   1.96e+00  -5.66288782e+03   2.54552547e-01   8.4317e-01   0.11
+ DS6      1.54e+00   1.53e+00   1.53e+00   2.66e+00  -5.65330389e+03   9.58392934e+00   6.5624e-01   0.27
+ DS7      3.48e+00   3.45e+00   3.45e+00   6.00e+00  -5.66100392e+03  -7.70002981e+00   3.6100e-01   0.16
+ DS8      4.02e+00   3.98e+00   3.98e+00   6.93e+00  -5.66255040e+03  -1.54648025e+00   3.2295e-01   0.12
+ DS9      4.09e+00   4.05e+00   4.05e+00   7.05e+00  -5.66250306e+03   4.73376558e-02   2.6493e-01   0.10
+ DS10     4.08e+00   4.04e+00   4.04e+00   7.03e+00  -5.66202969e+03   4.73376077e-01   1.4527e-01   0.12
+ DS11     4.29e+00   4.25e+00   4.25e+00   7.40e+00  -5.66220119e+03  -1.71501950e-01   2.5845e-02   0.12
+ DS12     4.64e+00   4.59e+00   4.59e+00   7.98e+00  -5.66227828e+03  -7.70963308e-02   6.0170e-02   0.16
+ DS13     4.65e+00   4.60e+00   4.60e+00   8.00e+00  -5.66242325e+03  -1.44967260e-01   2.8907e-02   0.11
+ DS14     4.65e+00   4.60e+00   4.60e+00   8.00e+00  -5.66243546e+03  -1.22093845e-02   2.4687e-02   0.14
+ DS15     4.65e+00   4.61e+00   4.61e+00   8.01e+00  -5.66242798e+03   7.48036244e-03   1.4412e-02   0.12
+ DS16     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66242955e+03  -1.56605404e-03   6.6989e-03   0.12
+ DS17     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66241570e+03   1.38475344e-02   4.8441e-03   0.07
+ DS18     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66239561e+03   2.00926871e-02   4.0264e-03   0.12
+ DS19     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66238472e+03   1.08890228e-02   1.3802e-03   0.10
+ SCF restart after this step!
+ DS20     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66288453e+03  -4.99809949e-01   1.4626e-04   0.12
+ DS21     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66239029e+03   4.94239547e-01   3.0808e-04   0.16
+ DS22     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66239089e+03  -5.99121350e-04   7.3385e-06   0.09
+ ----------------------------------------------------------------
+              Stress_x             Stress_y             Stress_z 
+ ----------------------------------------------------------------
+     -31999.5520569430        65.0633550480        64.9894611795 
+         65.0633550480    -33601.1727637891       560.2487427657 
+         64.9894611795       560.2487427657    -33601.1336857629 
+ ----------------------------------------------------------------
+ TOTAL-PRESSURE (EXCLUDE KINETIC PART OF IONS): -33067.286169 kbar
+
+ TIME STATISTICS
+-------------------------------------------------------------------
+    CLASS_NAME           NAME        TIME/s  CALLS   AVG/s  PER/%  
+-------------------------------------------------------------------
+ Driver            atomic_world      3.19   1        3.19   100.00 
+                   total             3.15   14       0.23   98.96  
+ PW_Basis_Sup      recip2real        0.05   397      0.00   1.72   
+ Relax_Driver      relax_driver      3.09   1        3.09   97.03  
+ ESolver_KS        runner            3.05   1        3.05   95.77  
+ Potential         cal_veff          0.10   23       0.00   3.15   
+ PW_Basis_Sup      real2recip        0.08   463      0.00   2.56   
+ PotXC             cal_veff          0.09   23       0.00   2.83   
+ XC_Functional     v_xc              0.10   25       0.00   3.15   
+ ESolver_KS_PW     hamilt2rho_single 2.84   22       0.13   89.23  
+ HSolverPW         solve             2.84   22       0.13   89.22  
+ HSolverPW         solve_psik        2.39   44       0.05   75.08  
+ Diago_DavSubspace diag_once         2.38   44       0.05   74.71  
+ Diago_DavSubspace first             0.75   44       0.02   23.52  
+ Operator          hPsi              1.83   197      0.01   57.52  
+ Operator          veff_pw           1.71   197      0.01   53.67  
+ PW_Basis_K        recip2real        1.19   11904    0.00   37.43  
+ PW_Basis_K        real2recip        0.90   8384     0.00   28.31  
+ Operator          nonlocal_pw       0.07   197      0.00   2.22   
+ Operator          OnsiteProjPW      0.05   197      0.00   1.58   
+ OnsiteProj        overlap           0.05   241      0.00   1.46   
+ Onsite_Proj_tools cal_becp          0.05   245      0.00   1.47   
+ Diago_DavSubspace cal_elem          0.06   197      0.00   1.90   
+ Diago_DavSubspace diag_zhegvx       0.32   197      0.00   9.93   
+ Diago_DavSubspace cal_grad          1.28   153      0.01   40.24  
+ ElecStatePW       psiToRho          0.43   22       0.02   13.61  
+ Charge_Mixing     get_drho          0.03   22       0.00   1.08   
+-------------------------------------------------------------------
+
+
+ START  Time  : Sun May  3 09:57:03 2026
+ FINISH Time  : Sun May  3 09:57:06 2026
+ TOTAL  Time  : 3
+ SEE INFORMATION IN : OUT.autotest/
diff --git a/tests/01_PW/099_PW_DJ_SO/log_pr_np4.txt b/tests/01_PW/099_PW_DJ_SO/log_pr_np4.txt
new file mode 100644
index 00000000000..6acad8020ab
--- /dev/null
+++ b/tests/01_PW/099_PW_DJ_SO/log_pr_np4.txt
@@ -0,0 +1,123 @@
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 55690612c (Sat May 2 13:10:55 2026 +0800)
+
+ Sun May  3 09:54:07 2026
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 8 for Fe: [Ar] 3d6 4s2
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(8.1e-06    SEC) : SETUP UNITCELL
+ DONE(0.00118406 SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       XC          
+ Fe      2           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.00644934 SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.0128259  SEC) : LOCAL POTENTIAL
+ DONE(0.020645   SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.0878906
+ DONE(0.0450215  SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(0.0919448  SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1     -3.50e-03  -3.47e-03  -3.47e-03   1.94e-01  -5.93364516e+03   0.00000000e+00   6.0771e+01   0.36
+ DS2     -1.98e-02  -1.99e-02  -1.99e-02   5.38e-02  -5.61422933e+03   3.19415829e+02   2.7921e+01   0.18
+ DS3      3.64e-01   3.62e-01   3.63e-01   6.30e-01  -5.66083209e+03  -4.66027629e+01   9.3631e-01   0.21
+ DS4      6.56e-01   6.54e-01   6.54e-01   1.13e+00  -5.66314237e+03  -2.31027985e+00   9.7969e-01   0.14
+ DS5      1.14e+00   1.13e+00   1.13e+00   1.96e+00  -5.66288782e+03   2.54552547e-01   8.4317e-01   0.37
+ DS6      1.54e+00   1.53e+00   1.53e+00   2.66e+00  -5.65330389e+03   9.58392934e+00   6.5624e-01   0.23
+ DS7      3.48e+00   3.45e+00   3.45e+00   6.00e+00  -5.66100392e+03  -7.70002981e+00   3.6100e-01   0.66
+ DS8      4.02e+00   3.98e+00   3.98e+00   6.93e+00  -5.66255040e+03  -1.54648025e+00   3.2295e-01   1.53
+ DS9      4.09e+00   4.05e+00   4.05e+00   7.05e+00  -5.66250306e+03   4.73376558e-02   2.6493e-01   0.18
+ DS10     4.08e+00   4.04e+00   4.04e+00   7.03e+00  -5.66202969e+03   4.73376077e-01   1.4527e-01   0.14
+ DS11     4.29e+00   4.25e+00   4.25e+00   7.40e+00  -5.66220119e+03  -1.71501951e-01   2.5845e-02   0.12
+ DS12     4.64e+00   4.59e+00   4.59e+00   7.98e+00  -5.66227828e+03  -7.70963306e-02   6.0170e-02   0.18
+ DS13     4.65e+00   4.60e+00   4.60e+00   8.00e+00  -5.66242325e+03  -1.44967260e-01   2.8907e-02   0.12
+ DS14     4.65e+00   4.60e+00   4.60e+00   8.00e+00  -5.66243546e+03  -1.22093846e-02   2.4687e-02   0.14
+ DS15     4.65e+00   4.61e+00   4.61e+00   8.01e+00  -5.66242798e+03   7.48036252e-03   1.4412e-02   0.09
+ DS16     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66242955e+03  -1.56605403e-03   6.6989e-03   0.15
+ DS17     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66241570e+03   1.38475345e-02   4.8441e-03   0.15
+ DS18     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66239561e+03   2.00926872e-02   4.0264e-03   0.12
+ DS19     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66238472e+03   1.08890224e-02   1.3802e-03   0.14
+ SCF restart after this step!
+ DS20     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66288453e+03  -4.99809950e-01   1.4626e-04   0.15
+ DS21     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66239029e+03   4.94239548e-01   3.0808e-04   0.16
+ DS22     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66239089e+03  -5.99121450e-04   7.3385e-06   0.10
+ ----------------------------------------------------------------
+              Stress_x             Stress_y             Stress_z 
+ ----------------------------------------------------------------
+     -32078.3250525856        67.5008795626        67.4184104029 
+         67.5008795626    -33686.4942094489       559.5736765290 
+         67.4184104029       559.5736765290    -33686.4455147576 
+ ----------------------------------------------------------------
+ TOTAL-PRESSURE (EXCLUDE KINETIC PART OF IONS): -33150.421592 kbar
+
+ TIME STATISTICS
+-------------------------------------------------------------------
+    CLASS_NAME           NAME        TIME/s  CALLS   AVG/s  PER/%  
+-------------------------------------------------------------------
+ Driver            atomic_world      5.80   1        5.80   100.00 
+                   total             5.76   14       0.41   99.38  
+ PW_Basis_Sup      recip2real        0.12   397      0.00   2.06   
+ Relax_Driver      relax_driver      5.71   1        5.71   98.56  
+ ESolver_KS        runner            5.66   1        5.66   97.70  
+ Potential         cal_veff          0.21   23       0.01   3.66   
+ PW_Basis_Sup      real2recip        0.15   463      0.00   2.56   
+ PotXC             cal_veff          0.19   23       0.01   3.36   
+ XC_Functional     v_xc              0.21   25       0.01   3.58   
+ ESolver_KS_PW     hamilt2rho_single 5.26   22       0.24   90.80  
+ HSolverPW         solve             5.26   22       0.24   90.80  
+ HSolverPW         solve_psik        4.64   44       0.11   80.07  
+ Diago_DavSubspace diag_once         4.62   44       0.11   79.72  
+ Diago_DavSubspace first             1.08   44       0.02   18.64  
+ Operator          hPsi              3.84   197      0.02   66.23  
+ Operator          veff_pw           3.65   197      0.02   62.92  
+ PW_Basis_K        recip2real        2.26   11904    0.00   38.95  
+ PW_Basis_K        real2recip        1.93   8384     0.00   33.30  
+ Operator          nonlocal_pw       0.11   197      0.00   1.85   
+ Operator          OnsiteProjPW      0.08   197      0.00   1.41   
+ OnsiteProj        overlap           0.08   241      0.00   1.35   
+ Onsite_Proj_tools cal_becp          0.08   245      0.00   1.42   
+ Diago_DavSubspace cal_elem          0.06   197      0.00   1.09   
+ Diago_DavSubspace diag_zhegvx       0.32   197      0.00   5.45   
+ Diago_DavSubspace cal_grad          3.20   153      0.02   55.12  
+ ElecStatePW       psiToRho          0.60   22       0.03   10.32  
+ Charge_Mixing     get_drho          0.06   22       0.00   1.08   
+ Charge_Mixing     mix_rho           0.06   20       0.00   1.04   
+-------------------------------------------------------------------
+
+
+ START  Time  : Sun May  3 09:54:07 2026
+ FINISH Time  : Sun May  3 09:54:13 2026
+ TOTAL  Time  : 6
+ SEE INFORMATION IN : OUT.autotest/
diff --git a/tests/01_PW/099_PW_DJ_SO/log_v2.txt b/tests/01_PW/099_PW_DJ_SO/log_v2.txt
new file mode 100644
index 00000000000..89fb998774a
--- /dev/null
+++ b/tests/01_PW/099_PW_DJ_SO/log_v2.txt
@@ -0,0 +1,121 @@
+                                                                                     
+                              ABACUS v3.11.0-beta.1
+
+               Atomic-orbital Based Ab-initio Computation at UStc                    
+
+                     Website: http://abacus.ustc.edu.cn/                             
+               Documentation: https://abacus.deepmodeling.com/                       
+                  Repository: https://github.com/abacusmodeling/abacus-develop       
+                              https://github.com/deepmodeling/abacus-develop         
+                      Commit: 5837a6526 (Sun May 3 09:44:20 2026 +0800)
+
+ Sun May  3 11:37:55 2026
+Info: Local MPI proc number: 4,OpenMP thread number: 3,Total thread number: 12,Local thread limit: 14
+ MAKE THE DIR         : OUT.autotest/
+ RUNNING WITH DEVICE  : CPU / Intel(R) Core(TM) Ultra 5 225H (x1)
+ WARNING: some of potential function is set to zero cause of less than 1e-30.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+ Warning: the number of valence electrons in pseudopotential > 8 for Fe: [Ar] 3d6 4s2
+ Pseudopotentials with additional electrons can yield (more) accurate outcomes, but may be less efficient.
+ If you're confident that your chosen pseudopotential is appropriate, you can safely ignore this warning.
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+ UNIFORM GRID DIM     : 24 * 24 * 24
+ UNIFORM GRID DIM(BIG): 24 * 24 * 24
+ DONE(9.762e-06  SEC) : SETUP UNITCELL
+ DONE(0.00206507 SEC) : INIT K-POINTS
+ ----------------------------------------------------------------
+ Self-consistent calculations for electrons
+ ----------------------------------------------------------------
+ SPIN    KPOINTS         PROCESSES   THREADS/PROC  THREADS/TOTAL 
+ 4       2               4           3             12            
+ ----------------------------------------------------------------
+ Use plane wave basis
+ ----------------------------------------------------------------
+ ELEMENT NATOM       XC          
+ Fe      2           
+ ----------------------------------------------------------------
+ Initial plane wave basis and FFT box
+ ----------------------------------------------------------------
+ DONE(0.0107033  SEC) : INIT PLANEWAVE
+ START CHARGE         : atomic
+ DONE(0.0343627  SEC) : LOCAL POTENTIAL
+ DONE(0.0477787  SEC) : NON-LOCAL POTENTIAL
+ MEMORY FOR PSI (MB)  : 0.0878906
+ DONE(0.0753504  SEC) : INIT BASIS
+
+ ================================================================
+ SELF-CONSISTENT: 
+ ================================================================
+ DONE(0.112708   SEC) : INIT SCF
+ ITER     TMAGX      TMAGY      TMAGZ       AMAG        ETOT/eV          EDIFF/eV         DRHO     TIME/s
+ DS1     -3.50e-03  -3.47e-03  -3.47e-03   1.94e-01  -5.93364516e+03   0.00000000e+00   6.0771e+01   0.64
+ DS2     -1.98e-02  -1.99e-02  -1.99e-02   5.38e-02  -5.61422933e+03   3.19415829e+02   2.7921e+01   0.83
+ DS3      3.64e-01   3.62e-01   3.63e-01   6.30e-01  -5.66083209e+03  -4.66027629e+01   9.3631e-01   0.72
+ DS4      6.56e-01   6.54e-01   6.54e-01   1.13e+00  -5.66314237e+03  -2.31027985e+00   9.7969e-01   1.36
+ DS5      1.14e+00   1.13e+00   1.13e+00   1.96e+00  -5.66288782e+03   2.54552547e-01   8.4317e-01   0.35
+ DS6      1.54e+00   1.53e+00   1.53e+00   2.66e+00  -5.65330389e+03   9.58392934e+00   6.5624e-01   0.22
+ DS7      3.48e+00   3.45e+00   3.45e+00   6.00e+00  -5.66100392e+03  -7.70002981e+00   3.6100e-01   0.23
+ DS8      4.02e+00   3.98e+00   3.98e+00   6.93e+00  -5.66255040e+03  -1.54648025e+00   3.2295e-01   0.21
+ DS9      4.09e+00   4.05e+00   4.05e+00   7.05e+00  -5.66250306e+03   4.73376558e-02   2.6493e-01   0.16
+ DS10     4.08e+00   4.04e+00   4.04e+00   7.03e+00  -5.66202969e+03   4.73376077e-01   1.4527e-01   0.21
+ DS11     4.29e+00   4.25e+00   4.25e+00   7.40e+00  -5.66220119e+03  -1.71501951e-01   2.5845e-02   0.13
+ DS12     4.64e+00   4.59e+00   4.59e+00   7.98e+00  -5.66227828e+03  -7.70963304e-02   6.0170e-02   0.18
+ DS13     4.65e+00   4.60e+00   4.60e+00   8.00e+00  -5.66242325e+03  -1.44967260e-01   2.8907e-02   0.18
+ DS14     4.65e+00   4.60e+00   4.60e+00   8.00e+00  -5.66243546e+03  -1.22093846e-02   2.4687e-02   0.17
+ DS15     4.65e+00   4.61e+00   4.61e+00   8.01e+00  -5.66242798e+03   7.48036260e-03   1.4412e-02   0.18
+ DS16     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66242955e+03  -1.56605403e-03   6.6989e-03   0.44
+ DS17     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66241570e+03   1.38475346e-02   4.8441e-03   0.22
+ DS18     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66239561e+03   2.00926873e-02   4.0264e-03   0.15
+ DS19     4.65e+00   4.61e+00   4.61e+00   8.02e+00  -5.66238472e+03   1.08890221e-02   1.3802e-03   0.17
+ SCF restart after this step!
+ DS20     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66288453e+03  -4.99809951e-01   1.4626e-04   0.19
+ DS21     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66239029e+03   4.94239549e-01   3.0808e-04   0.24
+ DS22     4.66e+00   4.61e+00   4.61e+00   8.02e+00  -5.66239089e+03  -5.99121512e-04   7.3385e-06   0.14
+ ----------------------------------------------------------------
+              Stress_x             Stress_y             Stress_z 
+ ----------------------------------------------------------------
+     -32078.3250525754        67.5008795602        67.4184104099 
+         67.5008795602    -33686.4942094585       559.5736765351 
+         67.4184104099       559.5736765351    -33686.4455147623 
+ ----------------------------------------------------------------
+ TOTAL-PRESSURE (EXCLUDE KINETIC PART OF IONS): -33150.421592 kbar
+
+ TIME STATISTICS
+-------------------------------------------------------------------
+    CLASS_NAME           NAME        TIME/s  CALLS   AVG/s  PER/%  
+-------------------------------------------------------------------
+ Driver            atomic_world      7.53   1        7.53   100.00 
+                   total             7.50   14       0.54   99.53  
+ PW_Basis_Sup      recip2real        0.10   397      0.00   1.38   
+ Relax_Driver      relax_driver      7.42   1        7.42   98.52  
+ ESolver_KS        runner            7.37   1        7.37   97.84  
+ Potential         cal_veff          0.22   23       0.01   2.92   
+ PW_Basis_Sup      real2recip        0.18   463      0.00   2.41   
+ PotXC             cal_veff          0.18   23       0.01   2.44   
+ XC_Functional     v_xc              0.20   25       0.01   2.68   
+ ESolver_KS_PW     hamilt2rho_single 6.98   22       0.32   92.63  
+ HSolverPW         solve             6.98   22       0.32   92.63  
+ HSolverPW         solve_psik        5.95   44       0.14   78.96  
+ Diago_DavSubspace diag_once         5.93   44       0.13   78.74  
+ Diago_DavSubspace first             2.13   44       0.05   28.26  
+ Operator          hPsi              5.07   197      0.03   67.33  
+ Operator          veff_pw           4.86   197      0.02   64.56  
+ PW_Basis_K        recip2real        3.29   11904    0.00   43.70  
+ PW_Basis_K        real2recip        2.50   8384     0.00   33.26  
+ Operator          nonlocal_pw       0.09   197      0.00   1.25   
+ Operator          OnsiteProjPW      0.11   197      0.00   1.48   
+ OnsiteProj        overlap           0.12   241      0.00   1.64   
+ Onsite_Proj_tools cal_becp          0.12   245      0.00   1.66   
+ Diago_DavSubspace cal_elem          0.15   197      0.00   1.95   
+ Diago_DavSubspace diag_zhegvx       0.37   197      0.00   4.96   
+ Diago_DavSubspace cal_grad          3.33   153      0.02   44.21  
+ ElecStatePW       psiToRho          1.01   22       0.05   13.41  
+-------------------------------------------------------------------
+
+
+ START  Time  : Sun May  3 11:37:55 2026
+ FINISH Time  : Sun May  3 11:38:03 2026
+ TOTAL  Time  : 8
+ SEE INFORMATION IN : OUT.autotest/
diff --git a/tests/01_PW/099_PW_DJ_SO/result_dev_np1.out b/tests/01_PW/099_PW_DJ_SO/result_dev_np1.out
new file mode 100644
index 00000000000..7712d6b3f76
--- /dev/null
+++ b/tests/01_PW/099_PW_DJ_SO/result_dev_np1.out
@@ -0,0 +1,5 @@
+etotref -5662.3894775916605795
+etotperatomref -2831.1947387958
+totalforceref 17.718002
+totalstressref 100581.716424
+totaltimeref 2.37
diff --git a/tests/01_PW/099_PW_DJ_SO/result_dev_np4.out b/tests/01_PW/099_PW_DJ_SO/result_dev_np4.out
new file mode 100644
index 00000000000..a24ab3f48b2
--- /dev/null
+++ b/tests/01_PW/099_PW_DJ_SO/result_dev_np4.out
@@ -0,0 +1,5 @@
+etotref -5662.3908859906132420
+etotperatomref -2831.1954429953
+totalforceref 17.965510
+totalstressref 100582.607209
+totaltimeref 3.18
diff --git a/tests/01_PW/099_PW_DJ_SO/result_final.out b/tests/01_PW/099_PW_DJ_SO/result_final.out
new file mode 100644
index 00000000000..797117b6d0c
--- /dev/null
+++ b/tests/01_PW/099_PW_DJ_SO/result_final.out
@@ -0,0 +1,5 @@
+etotref 
+etotperatomref 
+totalforceref 0.0
+totalstressref 0.0
+totaltimeref 
diff --git a/tests/01_PW/099_PW_DJ_SO/result_pr_fixed.out b/tests/01_PW/099_PW_DJ_SO/result_pr_fixed.out
new file mode 100644
index 00000000000..417295da7fa
--- /dev/null
+++ b/tests/01_PW/099_PW_DJ_SO/result_pr_fixed.out
@@ -0,0 +1,5 @@
+etotref -5662.3908859905586723
+etotperatomref -2831.1954429953
+totalforceref 17.965520
+totalstressref 100582.461625
+totaltimeref 3.19
diff --git a/tests/01_PW/099_PW_DJ_SO/result_pr_np4.out b/tests/01_PW/099_PW_DJ_SO/result_pr_np4.out
new file mode 100644
index 00000000000..43e7f0ff4f8
--- /dev/null
+++ b/tests/01_PW/099_PW_DJ_SO/result_pr_np4.out
@@ -0,0 +1,5 @@
+etotref -5662.3908859905150166
+etotperatomref -2831.1954429953
+totalforceref 17.963892
+totalstressref 100840.250711
+totaltimeref 5.80
diff --git a/tests/01_PW/099_PW_DJ_SO/result_v2.out b/tests/01_PW/099_PW_DJ_SO/result_v2.out
new file mode 100644
index 00000000000..fa945c71015
--- /dev/null
+++ b/tests/01_PW/099_PW_DJ_SO/result_v2.out
@@ -0,0 +1,5 @@
+etotref -5662.3908859906141515
+etotperatomref -2831.1954429953
+totalforceref 17.963892
+totalstressref 100840.250711
+totaltimeref 4.14
diff --git a/tests/01_PW/099_PW_DJ_SO/result_v2_check.out b/tests/01_PW/099_PW_DJ_SO/result_v2_check.out
new file mode 100644
index 00000000000..595310827fc
--- /dev/null
+++ b/tests/01_PW/099_PW_DJ_SO/result_v2_check.out
@@ -0,0 +1,5 @@
+etotref -5662.3908859904895507
+etotperatomref -2831.1954429952
+totalforceref 17.963892
+totalstressref 100840.250711
+totaltimeref 7.53
diff --git a/tests/01_PW/210_PW_kspace_shift/KPT b/tests/01_PW/210_PW_kspace_shift/KPT
new file mode 100644
index 00000000000..e54805b822e
--- /dev/null
+++ b/tests/01_PW/210_PW_kspace_shift/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+2 2 1 0.5 0.5 0.5
diff --git a/tests/01_PW/BUG_SCF_DSPIN/INPUT b/tests/01_PW/BUG_SCF_DSPIN/INPUT
index 94350f4eef1..bce9b733a90 100644
--- a/tests/01_PW/BUG_SCF_DSPIN/INPUT
+++ b/tests/01_PW/BUG_SCF_DSPIN/INPUT
@@ -29,7 +29,6 @@ decay_grad_switch  1      # switch to control gradient break condition
 sc_thr             1e-7   # Convergence criterion of spin-constrained iteration (RMS) in uB
 nsc                150    # Maximal number of spin-constrained iteration
 nsc_min            2      # Minimum number of spin-constrained iteration 
-sc_scf_nmin        2      # Minimum number of outer scf loop before initializing lambda loop
 alpha_trial        0.01   # Initial trial step size for lambda in eV/uB^2
 sccut              3      # Maximal step size for lambda in eV/uB
 sc_drop_thr        1.0e-2 # Convergence criterion ratio of lambda iteration in Spin-constrained DFT
diff --git a/tests/01_PW/test.sum b/tests/01_PW/test.sum
new file mode 100644
index 00000000000..d8420721227
--- /dev/null
+++ b/tests/01_PW/test.sum
@@ -0,0 +1,129 @@
+nscf_out_pot 1
+scf_out_elf 1
+scf_out_ldos 1
+scf_out_chg_tau 1
+001_PW_UPF100_Al 1
+002_PW_UPF100_RAPPE_Fe 1
+003_PW_UPF100_USPP_Fe 1
+004_PW_UPF201_Si 1
+005_PW_UPF201_UPF100 1
+006_PW_UPF201_Eu 1
+007_PW_UPF201_USPP_Fe 1
+008_PW_UPF201_USPP_NaCl 1
+009_PW_UPF201_USPP 1
+010_PW_0TYPE 1
+011_PW_0ATOM 1
+012_PW_DJ 1
+013_PW_ONCV_LDA 1
+014_PW_UPF201_BLPS 1
+015_PW_GTH 1
+016_PW_BLPS 1
+017_PW_LPS6 1
+018_PW_LPS8 1
+019_PW_Coulomb 1
+020_PW_kspace 1
+021_PW_kspace3 1
+022_PW_CG 1
+023_PW_DA 1
+024_PW_DS 1
+025_PW_DS_sca 1
+026_PW_KPAR 1
+027_PW_PINT_RKS 1
+028_PW_PINT_UKS 1
+029_PW_15_CF_CS_S1_smallg 1
+030_PW_15_CF_CS_S2_smallg 1
+031_PW_15_CF_CS 1
+032_PW_15_CF_CS_bspline 1
+033_PW_CF_CS_S1_smallg 1
+034_PW_CF_CS_S2_smallg 1
+035_PW_15_SO 1
+036_PW_AF 1
+037_PW_FM 1
+038_PW_NC 1
+039_PW_FD_smear 1
+040_PW_FX_smear 1
+041_PW_GA_smear 1
+042_PW_M2_smear 1
+043_PW_MP_smear 1
+044_PW_MV_smear 1
+045_PW_BD_chgmix 1
+046_PW_KK_chgmix 1
+047_PW_PK_chgmix 1
+048_PW_PL_chgmix 1
+049_PW_PU_chgmix 1
+050_PW_CHG_mismatch 1
+051_PW_OBOD_MemSaver 1
+052_PW_OB 1
+053_PW_OD 1
+055_PW_OW 1
+056_PW_IW 1
+057_PW_SO_IW 1
+058_PW_RE_MB 1
+059_PW_RE_MB_traj 1
+060_PW_RE_MG 1
+061_PW_RE_NEW 1
+062_PW_RE_PINT_RKS 1
+063_PW_CR 1
+064_PW_CR_fix_a 1
+065_PW_CR_fix_ab 1
+066_PW_CR_fix_abc 1
+067_PW_CR_fix_ac 1
+068_PW_CR_fix_b 1
+069_PW_CR_fix_bc 1
+070_PW_CR_fix_c 1
+071_PW_CR_move 1
+073_PW_SY 1
+074_PW_SY_LiRH 1
+075_PW_CHG_BINARY 1
+076_PW_elec_add 1
+077_PW_elec_minus 1
+078_PW_S2_elec_add 1
+079_PW_S2_elec_minus 1
+080_PW_dipole 1
+081_PW_efield 1
+082_PW_gatefield 1
+083_PW_sol_H2 1
+084_PW_sol_H2O 1
+085_PW_get_pchg 1
+086_PW_get_wf 1
+087_PW_get_pchg_kpar 1
+088_PW_get_pchg_sepk 1
+089_PW_get_wf_kpar 1
+090_PW_VWR 1
+091_PW_CR_VDW3 1
+092_PW_MSST 1
+093_PW_MSST2 1
+094_PW_NPT 1
+095_PW_NVT 1
+096_PW_PBE0 1
+096_PW_PBE0_AFM 0
+096_PW_PBE0_FM 1
+098_PW_15_SO_avg 1
+099_PW_DJ_SO 1
+100_PW_W90 1
+101_PW_MD_1O 1
+102_PW_MD_2O 1
+201_PW_UPF201_Ce_f 1
+202_PW_ONCV_Libxc 1
+204_PW_SY 1
+205_PW_SCAN 1
+206_PW_SCAN_S2 1
+207_PW_skip 1
+208_PW_CG_float 1
+209_PW_DFTHALF 1
+210_PW_kspace_shift 1
+801_PW_LT_sc 1
+802_PW_LT_fcc 1
+803_PW_LT_bcc 1
+804_PW_LT_hex 1
+805_PW_LT_trigonal 1
+806_PW_LT_st 1
+807_PW_LT_bct 1
+808_PW_LT_so 1
+809_PW_LT_baco 1
+810_PW_LT_fco 1
+811_PW_LT_bco 1
+812_PW_LT_sm 1
+813_PW_LT_bacm 1
+814_PW_LT_triclinic 1
+
diff --git a/tests/02_NAO_Gamma/md_msst/KPT b/tests/02_NAO_Gamma/md_msst/KPT
new file mode 100644
index 00000000000..c289c0158aa
--- /dev/null
+++ b/tests/02_NAO_Gamma/md_msst/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/02_NAO_Gamma/md_out_hk_spin2/KPT b/tests/02_NAO_Gamma/md_out_hk_spin2/KPT
new file mode 100644
index 00000000000..c289c0158aa
--- /dev/null
+++ b/tests/02_NAO_Gamma/md_out_hk_spin2/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/02_NAO_Gamma/md_out_hk_syns/KPT b/tests/02_NAO_Gamma/md_out_hk_syns/KPT
new file mode 100644
index 00000000000..c289c0158aa
--- /dev/null
+++ b/tests/02_NAO_Gamma/md_out_hk_syns/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/02_NAO_Gamma/relax_out_hk/KPT b/tests/02_NAO_Gamma/relax_out_hk/KPT
new file mode 100644
index 00000000000..c289c0158aa
--- /dev/null
+++ b/tests/02_NAO_Gamma/relax_out_hk/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/02_NAO_Gamma/relax_out_hk_spin2/KPT b/tests/02_NAO_Gamma/relax_out_hk_spin2/KPT
new file mode 100644
index 00000000000..c289c0158aa
--- /dev/null
+++ b/tests/02_NAO_Gamma/relax_out_hk_spin2/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/02_NAO_Gamma/scf_metagga/KPT b/tests/02_NAO_Gamma/scf_metagga/KPT
new file mode 100644
index 00000000000..c289c0158aa
--- /dev/null
+++ b/tests/02_NAO_Gamma/scf_metagga/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/02_NAO_Gamma/scf_out_hk/KPT b/tests/02_NAO_Gamma/scf_out_hk/KPT
new file mode 100644
index 00000000000..c289c0158aa
--- /dev/null
+++ b/tests/02_NAO_Gamma/scf_out_hk/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/02_NAO_Gamma/scf_out_hk_spin2/KPT b/tests/02_NAO_Gamma/scf_out_hk_spin2/KPT
new file mode 100644
index 00000000000..c289c0158aa
--- /dev/null
+++ b/tests/02_NAO_Gamma/scf_out_hk_spin2/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/02_NAO_Gamma/test.sum b/tests/02_NAO_Gamma/test.sum
new file mode 100644
index 00000000000..48aa711d30d
--- /dev/null
+++ b/tests/02_NAO_Gamma/test.sum
@@ -0,0 +1,32 @@
+get_pchg 1
+get_wf_spin2 1
+md_msst 1
+md_msst2 1
+md_out_hk_syns 1
+md_out_hk_spin2 1
+relax_bfgs2 1
+relax_cell 1
+relax_old_cg 1
+relax_out_hk 1
+relax_out_hk_spin2 1
+scf_afm 1
+scf_bsse 1
+scf_elenum 1
+scf_elenum_spin2 1
+scf_FeBiTe 1
+scf_fm 1
+scf_force_stress 1
+scf_in_wf 1
+scf_metagga 1
+scf_ocp_spin2 1
+scf_out_dm 1
+scf_out_hk 1
+scf_out_hk_spin2 1
+scf_out_hxc 1
+scf_out_mul 1
+scf_out_mul_spin2 1
+scf_out_wf 1
+scf_solvation 1
+scf_u_spin2 1
+scf_upf100 1
+
diff --git a/tests/03_NAO_multik/99_SCF_DPSIN/INPUT b/tests/03_NAO_multik/99_SCF_DPSIN/INPUT
index 499059e1ff3..c3a1a20aca3 100644
--- a/tests/03_NAO_multik/99_SCF_DPSIN/INPUT
+++ b/tests/03_NAO_multik/99_SCF_DPSIN/INPUT
@@ -30,7 +30,6 @@ decay_grad_switch  1      # switch to control gradient break condition
 sc_thr             1e-7   # Convergence criterion of spin-constrained iteration (RMS) in uB
 nsc                150    # Maximal number of spin-constrained iteration
 nsc_min            2      # Minimum number of spin-constrained iteration 
-sc_scf_nmin        2      # Minimum number of outer scf loop before initializing lambda loop
 alpha_trial        0.01   # Initial trial step size for lambda in eV/uB^2
 sccut              3      # Maximal step size for lambda in eV/uB
 sc_drop_thr        1.0e-2 # Convergence criterion ratio of lambda iteration in Spin-constrained DFT
diff --git a/tests/03_NAO_multik/test.sum b/tests/03_NAO_multik/test.sum
new file mode 100644
index 00000000000..308a62f6203
--- /dev/null
+++ b/tests/03_NAO_multik/test.sum
@@ -0,0 +1,62 @@
+scf_pp_upf1 1
+scf_pp_upf201 1
+scf_pp_sg15 1
+scf_pp_gth 1
+scf_angle_spin4 1
+scf_u_spin1 1
+scf_u_spin2 1
+scf_u_spin4 1
+scf_u_ramp 1
+scf_vdw3abc 1
+scf_solvation 1
+scf_eadd 1
+scf_eadd_spin2 1
+scf_eminus 1
+scf_eminus_spin2 1
+scf_bspline 1
+scf_0atoms 1
+scf_symm_prec 1
+scf_smallg_spin1 1
+scf_smallg_spin2 1
+scf_in_dmr_GaAs 1
+scf_in_dmr_Si 1
+scf_in_restart 1
+scf_out_dos_spin2 1
+scf_out_wf 1
+scf_out_chg_pot1 1
+scf_out_chg_tau 1
+scf_out_pot3 1
+scf_out_dmr_dmk 1
+scf_out_hsk 1
+scf_out_hsr 1
+scf_out_hsr_spin4 1
+scf_out_dh_t 1
+scf_out_dos_spin4 1
+scf_out_mul 1
+scf_out_mul_spin2 1
+scf_out_mul_spin4 1
+scf_out_mul_nupdw 1
+scf_out_elf 1
+scf_out_qo 1
+nscf_out_dos 1
+nscf_out_band_pband 1
+nscf_out_pot1 1
+nscf_out_mul 1
+nscf_out_hsr_tr_rr 1
+relax_bfgs2 1
+relax_old_cg 1
+relax_cell 1
+relax_cell_vdw2 1
+relax_cell_vdw3 1
+relax_cell_vdw3bj 1
+md_nvt 1
+md_msst 1
+md_out_wf 1
+md_out_syns 1
+md_chg_extra 1
+get_wf 1
+get_wf0 1
+get_pchg 1
+get_pchg_k 1
+get_s 1
+
diff --git a/tests/04_FF/01_LJ_Anderson/force.txt b/tests/04_FF/01_LJ_Anderson/force.txt
new file mode 100644
index 00000000000..86500b57fe3
--- /dev/null
+++ b/tests/04_FF/01_LJ_Anderson/force.txt
@@ -0,0 +1,32 @@
+-0.0399301391 0.0247854527 -0.0000033359
+-0.0052853095 0.0380086510 -0.0579946199
+0.0832612534 -0.0007009088 0.0000664701
+-0.0545684418 0.0655930238 -0.0077825543
+-0.0066320802 -0.0476547901 0.0316565979
+-0.0037479395 -0.0187909823 0.0091682353
+0.0486345682 0.0047097141 -0.0052037143
+-0.0165822779 -0.0444835190 0.0503534683
+-0.0624587207 0.0066718928 -0.0102211592
+0.0017313860 0.0134931635 -0.0282275033
+-0.0180148233 -0.0406687368 0.0318663205
+-0.0400899279 0.0539575020 0.0192601437
+-0.0033116474 0.0248749786 0.0252576452
+0.0095315261 -0.0174902215 0.0116278460
+-0.0262629243 0.0458160648 0.0517736188
+-0.0068670498 0.0046928578 -0.0445939858
+0.0036887414 -0.0542196122 0.0083812259
+0.0211560982 0.0006609579 -0.0259051228
+-0.0137182209 -0.0199504183 0.0048543241
+-0.0400582378 -0.0344445514 0.0492432962
+0.0145074696 -0.0075657662 -0.0025126150
+-0.0210135173 0.0066137335 0.0094962042
+0.0130028076 -0.0022930732 0.0156926842
+-0.0515800925 -0.0366551128 -0.0236325798
+-0.0112821589 -0.0279458941 -0.0157069650
+0.0019191380 0.0111061937 -0.0115045514
+-0.0049250105 -0.0230516425 0.0151820207
+0.0754666027 -0.0690680799 0.0304332442
+0.0022878062 0.0467488956 -0.0174763114
+0.0394530121 0.0329739530 0.0165460540
+0.0376601523 0.0515066697 -0.0772411131
+0.0740279573 0.0127696044 -0.0528532681
diff --git a/tests/04_FF/test.sum b/tests/04_FF/test.sum
new file mode 100644
index 00000000000..9a1d2eeb8a8
--- /dev/null
+++ b/tests/04_FF/test.sum
@@ -0,0 +1,21 @@
+01_LJ_Anderson 1
+02_LJ_Berendsen 1
+03_LJ_FIRE 1
+04_LJ_Langevin 1
+05_LJ_MSST 1
+06_LJ_NHC_NVT 1
+07_LJ_NPT_aniso_none 1
+08_LJ_NPT_aniso_xy 1
+09_LJ_NPT_aniso_xz 1
+10_LJ_NPT_aniso_yz 1
+11_LJ_NPT_iso 1
+12_LJ_NPT_tri 1
+13_LJ_NVE 1
+14_LJ_rescale_v 1
+15_LJ_rescaling 1
+16_LJ_RE_rule1 1
+17_LJ_CR_multi_ele 1
+18_LJ_single_rule2 1
+19_LJ_RE_stop 1
+20_LJ_dry_run 1
+
diff --git a/tests/17_DS_DFTU/01_LCAO_SPIN_S2_Z/INPUT b/tests/17_DS_DFTU/01_LCAO_SPIN_S2_Z/INPUT
new file mode 100644
index 00000000000..7b498cf0ca4
--- /dev/null
+++ b/tests/17_DS_DFTU/01_LCAO_SPIN_S2_Z/INPUT
@@ -0,0 +1,23 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+
+nspin    2
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/01_LCAO_SPIN_S2_Z/KPT b/tests/17_DS_DFTU/01_LCAO_SPIN_S2_Z/KPT
new file mode 100644
index 00000000000..c289c0158aa
--- /dev/null
+++ b/tests/17_DS_DFTU/01_LCAO_SPIN_S2_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/17_DS_DFTU/01_LCAO_SPIN_S2_Z/STRU b/tests/17_DS_DFTU/01_LCAO_SPIN_S2_Z/STRU
new file mode 100644
index 00000000000..8535c1db16e
--- /dev/null
+++ b/tests/17_DS_DFTU/01_LCAO_SPIN_S2_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0
+0.51   0.51   0.51   mag  -2.0
diff --git a/tests/17_DS_DFTU/01_LCAO_SPIN_S2_Z/result.ref b/tests/17_DS_DFTU/01_LCAO_SPIN_S2_Z/result.ref
new file mode 100644
index 00000000000..83d1b45cfc4
--- /dev/null
+++ b/tests/17_DS_DFTU/01_LCAO_SPIN_S2_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6787.961875326573
+etotperatomref -3393.9809376640
+totaltimeref 3.93
diff --git a/tests/17_DS_DFTU/02_LCAO_SPIN_S4_XYZ/INPUT b/tests/17_DS_DFTU/02_LCAO_SPIN_S4_XYZ/INPUT
new file mode 100644
index 00000000000..163c7b3bcd6
--- /dev/null
+++ b/tests/17_DS_DFTU/02_LCAO_SPIN_S4_XYZ/INPUT
@@ -0,0 +1,20 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+noncolin    1
+#nbands    40
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/02_LCAO_SPIN_S4_XYZ/KPT b/tests/17_DS_DFTU/02_LCAO_SPIN_S4_XYZ/KPT
new file mode 100644
index 00000000000..c289c0158aa
--- /dev/null
+++ b/tests/17_DS_DFTU/02_LCAO_SPIN_S4_XYZ/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/17_DS_DFTU/02_LCAO_SPIN_S4_XYZ/STRU b/tests/17_DS_DFTU/02_LCAO_SPIN_S4_XYZ/STRU
new file mode 100644
index 00000000000..a96b8d1a0e3
--- /dev/null
+++ b/tests/17_DS_DFTU/02_LCAO_SPIN_S4_XYZ/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  1.155  1.155  1.155
+0.51   0.51   0.51   magmom  -1.155  -1.155  -1.155
diff --git a/tests/17_DS_DFTU/02_LCAO_SPIN_S4_XYZ/result.ref b/tests/17_DS_DFTU/02_LCAO_SPIN_S4_XYZ/result.ref
new file mode 100644
index 00000000000..d1b31b18b36
--- /dev/null
+++ b/tests/17_DS_DFTU/02_LCAO_SPIN_S4_XYZ/result.ref
@@ -0,0 +1,3 @@
+etotref -6787.961880425138
+etotperatomref -3393.9809384824
+totaltimeref 8.90
diff --git a/tests/17_DS_DFTU/03_LCAO_DFTU_S2_Z/INPUT b/tests/17_DS_DFTU/03_LCAO_DFTU_S2_Z/INPUT
new file mode 100644
index 00000000000..1eb50a84479
--- /dev/null
+++ b/tests/17_DS_DFTU/03_LCAO_DFTU_S2_Z/INPUT
@@ -0,0 +1,28 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+
+nspin    2
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/03_LCAO_DFTU_S2_Z/KPT b/tests/17_DS_DFTU/03_LCAO_DFTU_S2_Z/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/03_LCAO_DFTU_S2_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/03_LCAO_DFTU_S2_Z/STRU b/tests/17_DS_DFTU/03_LCAO_DFTU_S2_Z/STRU
new file mode 100644
index 00000000000..8535c1db16e
--- /dev/null
+++ b/tests/17_DS_DFTU/03_LCAO_DFTU_S2_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0
+0.51   0.51   0.51   mag  -2.0
diff --git a/tests/17_DS_DFTU/03_LCAO_DFTU_S2_Z/result.ref b/tests/17_DS_DFTU/03_LCAO_DFTU_S2_Z/result.ref
new file mode 100644
index 00000000000..bb1d7aa5ce8
--- /dev/null
+++ b/tests/17_DS_DFTU/03_LCAO_DFTU_S2_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6772.0999515218118177
+etotperatomref -3386.0499757609
+totaltimeref 6.11
diff --git a/tests/17_DS_DFTU/04_LCAO_DFTU_S4_XY/INPUT b/tests/17_DS_DFTU/04_LCAO_DFTU_S4_XY/INPUT
new file mode 100644
index 00000000000..7daab2ff56e
--- /dev/null
+++ b/tests/17_DS_DFTU/04_LCAO_DFTU_S4_XY/INPUT
@@ -0,0 +1,28 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+
+noncolin    1
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/04_LCAO_DFTU_S4_XY/KPT b/tests/17_DS_DFTU/04_LCAO_DFTU_S4_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/04_LCAO_DFTU_S4_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/04_LCAO_DFTU_S4_XY/STRU b/tests/17_DS_DFTU/04_LCAO_DFTU_S4_XY/STRU
new file mode 100644
index 00000000000..63c4d14399c
--- /dev/null
+++ b/tests/17_DS_DFTU/04_LCAO_DFTU_S4_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0
diff --git a/tests/17_DS_DFTU/04_LCAO_DFTU_S4_XY/result.ref b/tests/17_DS_DFTU/04_LCAO_DFTU_S4_XY/result.ref
new file mode 100644
index 00000000000..1a258220cfb
--- /dev/null
+++ b/tests/17_DS_DFTU/04_LCAO_DFTU_S4_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6772.1004497577005168
+etotperatomref -3386.0500835053
+totaltimeref 10.33
diff --git a/tests/17_DS_DFTU/05_LCAO_DFTU_S4_XYZ/INPUT b/tests/17_DS_DFTU/05_LCAO_DFTU_S4_XYZ/INPUT
new file mode 100644
index 00000000000..efb3db1a055
--- /dev/null
+++ b/tests/17_DS_DFTU/05_LCAO_DFTU_S4_XYZ/INPUT
@@ -0,0 +1,27 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+noncolin    1
+#nbands    40
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/05_LCAO_DFTU_S4_XYZ/KPT b/tests/17_DS_DFTU/05_LCAO_DFTU_S4_XYZ/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/05_LCAO_DFTU_S4_XYZ/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/05_LCAO_DFTU_S4_XYZ/STRU b/tests/17_DS_DFTU/05_LCAO_DFTU_S4_XYZ/STRU
new file mode 100644
index 00000000000..a96b8d1a0e3
--- /dev/null
+++ b/tests/17_DS_DFTU/05_LCAO_DFTU_S4_XYZ/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  1.155  1.155  1.155
+0.51   0.51   0.51   magmom  -1.155  -1.155  -1.155
diff --git a/tests/17_DS_DFTU/05_LCAO_DFTU_S4_XYZ/result.ref b/tests/17_DS_DFTU/05_LCAO_DFTU_S4_XYZ/result.ref
new file mode 100644
index 00000000000..e9c930d41ef
--- /dev/null
+++ b/tests/17_DS_DFTU/05_LCAO_DFTU_S4_XYZ/result.ref
@@ -0,0 +1,3 @@
+etotref -6772.1004562034922856
+etotperatomref -3386.0500833394
+totaltimeref 10.35
diff --git a/tests/17_DS_DFTU/06_PW_SPIN_S2_Z/INPUT b/tests/17_DS_DFTU/06_PW_SPIN_S2_Z/INPUT
new file mode 100644
index 00000000000..567770e830b
--- /dev/null
+++ b/tests/17_DS_DFTU/06_PW_SPIN_S2_Z/INPUT
@@ -0,0 +1,20 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    50
+gamma_only    0
+nspin    2
+nbands    28
+scf_thr    1.0e-6
+scf_nmax    100
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+pseudo_dir    ../../PP_ORB
+pw_seed 1
diff --git a/tests/17_DS_DFTU/06_PW_SPIN_S2_Z/KPT b/tests/17_DS_DFTU/06_PW_SPIN_S2_Z/KPT
new file mode 100644
index 00000000000..c289c0158aa
--- /dev/null
+++ b/tests/17_DS_DFTU/06_PW_SPIN_S2_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/17_DS_DFTU/06_PW_SPIN_S2_Z/STRU b/tests/17_DS_DFTU/06_PW_SPIN_S2_Z/STRU
new file mode 100644
index 00000000000..7d8feef3406
--- /dev/null
+++ b/tests/17_DS_DFTU/06_PW_SPIN_S2_Z/STRU
@@ -0,0 +1,18 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0
+0.51   0.51   0.51   mag  -2.0
diff --git a/tests/17_DS_DFTU/06_PW_SPIN_S2_Z/result.ref b/tests/17_DS_DFTU/06_PW_SPIN_S2_Z/result.ref
new file mode 100644
index 00000000000..5a43c537250
--- /dev/null
+++ b/tests/17_DS_DFTU/06_PW_SPIN_S2_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6807.727140777411
+etotperatomref -3403.8635703887
+totaltimeref 2.73
diff --git a/tests/17_DS_DFTU/07_PW_SPIN_S4_XYZ/INPUT b/tests/17_DS_DFTU/07_PW_SPIN_S4_XYZ/INPUT
new file mode 100644
index 00000000000..f0efbfb4f01
--- /dev/null
+++ b/tests/17_DS_DFTU/07_PW_SPIN_S4_XYZ/INPUT
@@ -0,0 +1,21 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+noncolin    1
+#nbands    40
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+kpar    1
+pseudo_dir    ../../PP_ORB
+pw_seed 1
diff --git a/tests/17_DS_DFTU/07_PW_SPIN_S4_XYZ/KPT b/tests/17_DS_DFTU/07_PW_SPIN_S4_XYZ/KPT
new file mode 100644
index 00000000000..c289c0158aa
--- /dev/null
+++ b/tests/17_DS_DFTU/07_PW_SPIN_S4_XYZ/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/17_DS_DFTU/07_PW_SPIN_S4_XYZ/STRU b/tests/17_DS_DFTU/07_PW_SPIN_S4_XYZ/STRU
new file mode 100644
index 00000000000..d8ea895cf0b
--- /dev/null
+++ b/tests/17_DS_DFTU/07_PW_SPIN_S4_XYZ/STRU
@@ -0,0 +1,18 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  1.155  1.155  1.155
+0.51   0.51   0.51   magmom  -1.155  -1.155  -1.155
diff --git a/tests/17_DS_DFTU/07_PW_SPIN_S4_XYZ/result.ref b/tests/17_DS_DFTU/07_PW_SPIN_S4_XYZ/result.ref
new file mode 100644
index 00000000000..c17d6b8de03
--- /dev/null
+++ b/tests/17_DS_DFTU/07_PW_SPIN_S4_XYZ/result.ref
@@ -0,0 +1,3 @@
+etotref -6350.021298529959
+etotperatomref -3175.0106492650
+totaltimeref 1.53
diff --git a/tests/17_DS_DFTU/08_PW_DFTU_S2_Z/INPUT b/tests/17_DS_DFTU/08_PW_DFTU_S2_Z/INPUT
new file mode 100644
index 00000000000..88bcde220e8
--- /dev/null
+++ b/tests/17_DS_DFTU/08_PW_DFTU_S2_Z/INPUT
@@ -0,0 +1,29 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    50
+gamma_only    0
+device    cpu
+
+nspin    2
+nbands    28
+scf_thr    1.0e-6
+scf_nmax    100
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+pw_seed 1
diff --git a/tests/17_DS_DFTU/08_PW_DFTU_S2_Z/KPT b/tests/17_DS_DFTU/08_PW_DFTU_S2_Z/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/08_PW_DFTU_S2_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/08_PW_DFTU_S2_Z/STRU b/tests/17_DS_DFTU/08_PW_DFTU_S2_Z/STRU
new file mode 100644
index 00000000000..8535c1db16e
--- /dev/null
+++ b/tests/17_DS_DFTU/08_PW_DFTU_S2_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0
+0.51   0.51   0.51   mag  -2.0
diff --git a/tests/17_DS_DFTU/08_PW_DFTU_S2_Z/result.ref b/tests/17_DS_DFTU/08_PW_DFTU_S2_Z/result.ref
new file mode 100644
index 00000000000..f9ddfdd28af
--- /dev/null
+++ b/tests/17_DS_DFTU/08_PW_DFTU_S2_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6792.3335167095001452
+etotperatomref -3396.1667583548
+totaltimeref 21.07
diff --git a/tests/17_DS_DFTU/09_PW_DFTU_S4_XY/INPUT b/tests/17_DS_DFTU/09_PW_DFTU_S4_XY/INPUT
new file mode 100644
index 00000000000..5d19e1c0665
--- /dev/null
+++ b/tests/17_DS_DFTU/09_PW_DFTU_S4_XY/INPUT
@@ -0,0 +1,29 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+device    cpu
+
+noncolin    1
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+kpar    1
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+pw_seed 1
diff --git a/tests/17_DS_DFTU/09_PW_DFTU_S4_XY/KPT b/tests/17_DS_DFTU/09_PW_DFTU_S4_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/09_PW_DFTU_S4_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/09_PW_DFTU_S4_XY/STRU b/tests/17_DS_DFTU/09_PW_DFTU_S4_XY/STRU
new file mode 100644
index 00000000000..63c4d14399c
--- /dev/null
+++ b/tests/17_DS_DFTU/09_PW_DFTU_S4_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0
diff --git a/tests/17_DS_DFTU/09_PW_DFTU_S4_XY/result.ref b/tests/17_DS_DFTU/09_PW_DFTU_S4_XY/result.ref
new file mode 100644
index 00000000000..8242af7627b
--- /dev/null
+++ b/tests/17_DS_DFTU/09_PW_DFTU_S4_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6364.2658763901727070
+etotperatomref -3182.1329381951
+totaltimeref 7.82
diff --git a/tests/17_DS_DFTU/11_PW_DFTU_S2_FeO/INPUT b/tests/17_DS_DFTU/11_PW_DFTU_S2_FeO/INPUT
new file mode 100644
index 00000000000..5ec0a0f0e53
--- /dev/null
+++ b/tests/17_DS_DFTU/11_PW_DFTU_S2_FeO/INPUT
@@ -0,0 +1,29 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+
+nspin    2
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+kpar    2
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+pw_seed 1
diff --git a/tests/17_DS_DFTU/11_PW_DFTU_S2_FeO/KPT b/tests/17_DS_DFTU/11_PW_DFTU_S2_FeO/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/11_PW_DFTU_S2_FeO/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/11_PW_DFTU_S2_FeO/STRU b/tests/17_DS_DFTU/11_PW_DFTU_S2_FeO/STRU
new file mode 100644
index 00000000000..8535c1db16e
--- /dev/null
+++ b/tests/17_DS_DFTU/11_PW_DFTU_S2_FeO/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0
+0.51   0.51   0.51   mag  -2.0
diff --git a/tests/17_DS_DFTU/11_PW_DFTU_S2_FeO/result.ref b/tests/17_DS_DFTU/11_PW_DFTU_S2_FeO/result.ref
new file mode 100644
index 00000000000..ecc536b3d55
--- /dev/null
+++ b/tests/17_DS_DFTU/11_PW_DFTU_S2_FeO/result.ref
@@ -0,0 +1,3 @@
+etotref -6364.2654756626079688
+etotperatomref -3182.1327378313039844
+totaltimeref 2.04
diff --git a/tests/17_DS_DFTU/12_PW_DS_S2_Z/INPUT b/tests/17_DS_DFTU/12_PW_DS_S2_Z/INPUT
new file mode 100644
index 00000000000..28530e1f7f6
--- /dev/null
+++ b/tests/17_DS_DFTU/12_PW_DS_S2_Z/INPUT
@@ -0,0 +1,31 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+nspin    2
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+
+kpar    2
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+pw_seed 1
diff --git a/tests/17_DS_DFTU/12_PW_DS_S2_Z/KPT b/tests/17_DS_DFTU/12_PW_DS_S2_Z/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/12_PW_DS_S2_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/12_PW_DS_S2_Z/STRU b/tests/17_DS_DFTU/12_PW_DS_S2_Z/STRU
new file mode 100644
index 00000000000..b942348be5d
--- /dev/null
+++ b/tests/17_DS_DFTU/12_PW_DS_S2_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0   sc 1 1 1
+0.51   0.51   0.51   mag  -2.0  sc 1 1 1
diff --git a/tests/17_DS_DFTU/12_PW_DS_S2_Z/result.ref b/tests/17_DS_DFTU/12_PW_DS_S2_Z/result.ref
new file mode 100644
index 00000000000..9dd83d8e706
--- /dev/null
+++ b/tests/17_DS_DFTU/12_PW_DS_S2_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6366.569118260046
+etotperatomref -3183.2845591300
+totaltimeref 1.97
diff --git a/tests/17_DS_DFTU/14_PW_DS_S4_XYZ/INPUT b/tests/17_DS_DFTU/14_PW_DS_S4_XYZ/INPUT
new file mode 100644
index 00000000000..0d74fa0b60a
--- /dev/null
+++ b/tests/17_DS_DFTU/14_PW_DS_S4_XYZ/INPUT
@@ -0,0 +1,31 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+noncolin    1
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+
+kpar    2
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+pw_seed 1
diff --git a/tests/17_DS_DFTU/14_PW_DS_S4_XYZ/KPT b/tests/17_DS_DFTU/14_PW_DS_S4_XYZ/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/14_PW_DS_S4_XYZ/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/14_PW_DS_S4_XYZ/STRU b/tests/17_DS_DFTU/14_PW_DS_S4_XYZ/STRU
new file mode 100644
index 00000000000..0a9effad744
--- /dev/null
+++ b/tests/17_DS_DFTU/14_PW_DS_S4_XYZ/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  1.155  1.155  1.155  sc 1 1 1
+0.51   0.51   0.51   magmom  -1.155  -1.155  -1.155  sc 1 1 1
diff --git a/tests/17_DS_DFTU/14_PW_DS_S4_XYZ/result.ref b/tests/17_DS_DFTU/14_PW_DS_S4_XYZ/result.ref
new file mode 100644
index 00000000000..f63986cceb7
--- /dev/null
+++ b/tests/17_DS_DFTU/14_PW_DS_S4_XYZ/result.ref
@@ -0,0 +1,3 @@
+etotref -6366.562922988214
+etotperatomref -3183.2814614941
+totaltimeref 4.23
diff --git a/tests/17_DS_DFTU/15_PW_DS_S4_Z/INPUT b/tests/17_DS_DFTU/15_PW_DS_S4_Z/INPUT
new file mode 100644
index 00000000000..a300c671979
--- /dev/null
+++ b/tests/17_DS_DFTU/15_PW_DS_S4_Z/INPUT
@@ -0,0 +1,31 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+noncolin    1
+#nbands    40
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+
+kpar    2
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+pw_seed 1
diff --git a/tests/17_DS_DFTU/15_PW_DS_S4_Z/KPT b/tests/17_DS_DFTU/15_PW_DS_S4_Z/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/15_PW_DS_S4_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/15_PW_DS_S4_Z/STRU b/tests/17_DS_DFTU/15_PW_DS_S4_Z/STRU
new file mode 100644
index 00000000000..bbe4a2796fa
--- /dev/null
+++ b/tests/17_DS_DFTU/15_PW_DS_S4_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0  sc 1 1 1
+0.51   0.51   0.51   mag  -2.0  sc 1 1 1
diff --git a/tests/17_DS_DFTU/15_PW_DS_S4_Z/result.ref b/tests/17_DS_DFTU/15_PW_DS_S4_Z/result.ref
new file mode 100644
index 00000000000..07523240a6f
--- /dev/null
+++ b/tests/17_DS_DFTU/15_PW_DS_S4_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6366.562433916121
+etotperatomref -3183.2812169581
+totaltimeref 4.26
diff --git a/tests/17_DS_DFTU/16_PW_DS_S4_XY/INPUT b/tests/17_DS_DFTU/16_PW_DS_S4_XY/INPUT
new file mode 100644
index 00000000000..a300c671979
--- /dev/null
+++ b/tests/17_DS_DFTU/16_PW_DS_S4_XY/INPUT
@@ -0,0 +1,31 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+noncolin    1
+#nbands    40
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+
+kpar    2
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+pw_seed 1
diff --git a/tests/17_DS_DFTU/16_PW_DS_S4_XY/KPT b/tests/17_DS_DFTU/16_PW_DS_S4_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/16_PW_DS_S4_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/16_PW_DS_S4_XY/STRU b/tests/17_DS_DFTU/16_PW_DS_S4_XY/STRU
new file mode 100644
index 00000000000..1ffecf17384
--- /dev/null
+++ b/tests/17_DS_DFTU/16_PW_DS_S4_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0  sc 1 1 1
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0  sc 1 1 1
diff --git a/tests/17_DS_DFTU/16_PW_DS_S4_XY/result.ref b/tests/17_DS_DFTU/16_PW_DS_S4_XY/result.ref
new file mode 100644
index 00000000000..c2ed3287e21
--- /dev/null
+++ b/tests/17_DS_DFTU/16_PW_DS_S4_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6366.562695059035
+etotperatomref -3183.2813475295
+totaltimeref 4.19
diff --git a/tests/17_DS_DFTU/18_PW_DFTU_DS_S2_Z/INPUT b/tests/17_DS_DFTU/18_PW_DFTU_DS_S2_Z/INPUT
new file mode 100644
index 00000000000..f403abe54b4
--- /dev/null
+++ b/tests/17_DS_DFTU/18_PW_DFTU_DS_S2_Z/INPUT
@@ -0,0 +1,39 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+
+nspin    2
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+
+kpar    2
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+
+pw_seed 1
diff --git a/tests/17_DS_DFTU/18_PW_DFTU_DS_S2_Z/KPT b/tests/17_DS_DFTU/18_PW_DFTU_DS_S2_Z/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/18_PW_DFTU_DS_S2_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/18_PW_DFTU_DS_S2_Z/STRU b/tests/17_DS_DFTU/18_PW_DFTU_DS_S2_Z/STRU
new file mode 100644
index 00000000000..bbe4a2796fa
--- /dev/null
+++ b/tests/17_DS_DFTU/18_PW_DFTU_DS_S2_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0  sc 1 1 1
+0.51   0.51   0.51   mag  -2.0  sc 1 1 1
diff --git a/tests/17_DS_DFTU/18_PW_DFTU_DS_S2_Z/result.ref b/tests/17_DS_DFTU/18_PW_DFTU_DS_S2_Z/result.ref
new file mode 100644
index 00000000000..654cb15b3ee
--- /dev/null
+++ b/tests/17_DS_DFTU/18_PW_DFTU_DS_S2_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6355.9855588350255857
+etotperatomref -3177.9927794175
+totaltimeref 2.89
diff --git a/tests/17_DS_DFTU/19_PW_DFTU_DS_S4_XY/INPUT b/tests/17_DS_DFTU/19_PW_DFTU_DS_S4_XY/INPUT
new file mode 100644
index 00000000000..34cb4471478
--- /dev/null
+++ b/tests/17_DS_DFTU/19_PW_DFTU_DS_S4_XY/INPUT
@@ -0,0 +1,39 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+device    cpu
+
+noncolin    1
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+
+kpar    2
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+
+pw_seed 1
diff --git a/tests/17_DS_DFTU/19_PW_DFTU_DS_S4_XY/KPT b/tests/17_DS_DFTU/19_PW_DFTU_DS_S4_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/19_PW_DFTU_DS_S4_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/19_PW_DFTU_DS_S4_XY/STRU b/tests/17_DS_DFTU/19_PW_DFTU_DS_S4_XY/STRU
new file mode 100644
index 00000000000..1ffecf17384
--- /dev/null
+++ b/tests/17_DS_DFTU/19_PW_DFTU_DS_S4_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0  sc 1 1 1
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0  sc 1 1 1
diff --git a/tests/17_DS_DFTU/19_PW_DFTU_DS_S4_XY/result.ref b/tests/17_DS_DFTU/19_PW_DFTU_DS_S4_XY/result.ref
new file mode 100644
index 00000000000..ed294b04143
--- /dev/null
+++ b/tests/17_DS_DFTU/19_PW_DFTU_DS_S4_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6355.9841673819892094
+etotperatomref -3177.9920836910
+totaltimeref 5.88
diff --git a/tests/17_DS_DFTU/21_PW_DFTU_DS_S4_Z/INPUT b/tests/17_DS_DFTU/21_PW_DFTU_DS_S4_Z/INPUT
new file mode 100644
index 00000000000..a8de392596a
--- /dev/null
+++ b/tests/17_DS_DFTU/21_PW_DFTU_DS_S4_Z/INPUT
@@ -0,0 +1,38 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+noncolin    1
+#nbands    40
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+
+kpar    2
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+
+pw_seed 1
diff --git a/tests/17_DS_DFTU/21_PW_DFTU_DS_S4_Z/KPT b/tests/17_DS_DFTU/21_PW_DFTU_DS_S4_Z/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/21_PW_DFTU_DS_S4_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/21_PW_DFTU_DS_S4_Z/STRU b/tests/17_DS_DFTU/21_PW_DFTU_DS_S4_Z/STRU
new file mode 100644
index 00000000000..bbe4a2796fa
--- /dev/null
+++ b/tests/17_DS_DFTU/21_PW_DFTU_DS_S4_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0  sc 1 1 1
+0.51   0.51   0.51   mag  -2.0  sc 1 1 1
diff --git a/tests/17_DS_DFTU/21_PW_DFTU_DS_S4_Z/result.ref b/tests/17_DS_DFTU/21_PW_DFTU_DS_S4_Z/result.ref
new file mode 100644
index 00000000000..1810d27088f
--- /dev/null
+++ b/tests/17_DS_DFTU/21_PW_DFTU_DS_S4_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6355.9834051938123594
+etotperatomref -3177.9917025969
+totaltimeref 5.46
diff --git a/tests/17_DS_DFTU/24_LCAO_DS_S2_Z/INPUT b/tests/17_DS_DFTU/24_LCAO_DS_S2_Z/INPUT
new file mode 100644
index 00000000000..fe803afb672
--- /dev/null
+++ b/tests/17_DS_DFTU/24_LCAO_DS_S2_Z/INPUT
@@ -0,0 +1,33 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+
+nspin    4
+noncolin 1
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    1e-3
+#sc_lambda_strategy    linear_scan
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/24_LCAO_DS_S2_Z/KPT b/tests/17_DS_DFTU/24_LCAO_DS_S2_Z/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/24_LCAO_DS_S2_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/24_LCAO_DS_S2_Z/STRU b/tests/17_DS_DFTU/24_LCAO_DS_S2_Z/STRU
new file mode 100644
index 00000000000..cf31449a12b
--- /dev/null
+++ b/tests/17_DS_DFTU/24_LCAO_DS_S2_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0  sc 1 1 1 lambda 1 1 1
+0.51   0.51   0.51   mag  -2.0  sc 1 1 1 lambda 1 1 1
diff --git a/tests/17_DS_DFTU/24_LCAO_DS_S2_Z/result.ref b/tests/17_DS_DFTU/24_LCAO_DS_S2_Z/result.ref
new file mode 100644
index 00000000000..3ea422eac92
--- /dev/null
+++ b/tests/17_DS_DFTU/24_LCAO_DS_S2_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6777.701256736625
+etotperatomref -3388.8506283683
+totaltimeref 24.31
diff --git a/tests/17_DS_DFTU/26_LCAO_DS_S4_XYZ/INPUT b/tests/17_DS_DFTU/26_LCAO_DS_S4_XYZ/INPUT
new file mode 100644
index 00000000000..b2b6ce9c8d7
--- /dev/null
+++ b/tests/17_DS_DFTU/26_LCAO_DS_S4_XYZ/INPUT
@@ -0,0 +1,32 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+
+noncolin    1
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    100
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    1e-2
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+sc_lambda_strategy    linear_response
diff --git a/tests/17_DS_DFTU/26_LCAO_DS_S4_XYZ/KPT b/tests/17_DS_DFTU/26_LCAO_DS_S4_XYZ/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/26_LCAO_DS_S4_XYZ/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/26_LCAO_DS_S4_XYZ/STRU b/tests/17_DS_DFTU/26_LCAO_DS_S4_XYZ/STRU
new file mode 100644
index 00000000000..928faa5b29c
--- /dev/null
+++ b/tests/17_DS_DFTU/26_LCAO_DS_S4_XYZ/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  1.155  1.155  1.155 sc 1 1 1
+0.51   0.51   0.51   magmom  -1.155  -1.155  -1.155 sc 1 1 1
diff --git a/tests/17_DS_DFTU/26_LCAO_DS_S4_XYZ/result.ref b/tests/17_DS_DFTU/26_LCAO_DS_S4_XYZ/result.ref
new file mode 100644
index 00000000000..2ffe77f0f0b
--- /dev/null
+++ b/tests/17_DS_DFTU/26_LCAO_DS_S4_XYZ/result.ref
@@ -0,0 +1,3 @@
+etotref -6777.701352737945
+etotperatomref -3388.8506763690
+totaltimeref 31.91
diff --git a/tests/17_DS_DFTU/27_LCAO_DS_S4_Z/INPUT b/tests/17_DS_DFTU/27_LCAO_DS_S4_Z/INPUT
new file mode 100644
index 00000000000..4797cb91fcf
--- /dev/null
+++ b/tests/17_DS_DFTU/27_LCAO_DS_S4_Z/INPUT
@@ -0,0 +1,30 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+noncolin    1
+#nbands    40
+scf_thr    1.0e-6
+scf_nmax    100
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    1e-2
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/27_LCAO_DS_S4_Z/KPT b/tests/17_DS_DFTU/27_LCAO_DS_S4_Z/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/27_LCAO_DS_S4_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/27_LCAO_DS_S4_Z/STRU b/tests/17_DS_DFTU/27_LCAO_DS_S4_Z/STRU
new file mode 100644
index 00000000000..b6c219f04a6
--- /dev/null
+++ b/tests/17_DS_DFTU/27_LCAO_DS_S4_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  0.0  0.0  2.0 sc 0 0 1
+0.51   0.51   0.51   magmom  0.0  0.0  -2.0 sc 0 0 1
diff --git a/tests/17_DS_DFTU/27_LCAO_DS_S4_Z/result.ref b/tests/17_DS_DFTU/27_LCAO_DS_S4_Z/result.ref
new file mode 100644
index 00000000000..820480722b9
--- /dev/null
+++ b/tests/17_DS_DFTU/27_LCAO_DS_S4_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6777.700905394034
+etotperatomref -3388.8504526970
+totaltimeref 30.38
diff --git a/tests/17_DS_DFTU/28_LCAO_DS_S4_XY/INPUT b/tests/17_DS_DFTU/28_LCAO_DS_S4_XY/INPUT
new file mode 100644
index 00000000000..4797cb91fcf
--- /dev/null
+++ b/tests/17_DS_DFTU/28_LCAO_DS_S4_XY/INPUT
@@ -0,0 +1,30 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+noncolin    1
+#nbands    40
+scf_thr    1.0e-6
+scf_nmax    100
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    1e-2
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/28_LCAO_DS_S4_XY/KPT b/tests/17_DS_DFTU/28_LCAO_DS_S4_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/28_LCAO_DS_S4_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/28_LCAO_DS_S4_XY/STRU b/tests/17_DS_DFTU/28_LCAO_DS_S4_XY/STRU
new file mode 100644
index 00000000000..ac54252cab0
--- /dev/null
+++ b/tests/17_DS_DFTU/28_LCAO_DS_S4_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0 sc 1 1 0
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0 sc 1 1 0
diff --git a/tests/17_DS_DFTU/28_LCAO_DS_S4_XY/result.ref b/tests/17_DS_DFTU/28_LCAO_DS_S4_XY/result.ref
new file mode 100644
index 00000000000..897f0f8b2e0
--- /dev/null
+++ b/tests/17_DS_DFTU/28_LCAO_DS_S4_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6777.701049634186
+etotperatomref -3388.8505248171
+totaltimeref 31.96
diff --git a/tests/17_DS_DFTU/30_LCAO_DFTU_DS_S2_Z/INPUT b/tests/17_DS_DFTU/30_LCAO_DFTU_DS_S2_Z/INPUT
new file mode 100644
index 00000000000..0d8d5a0c3f1
--- /dev/null
+++ b/tests/17_DS_DFTU/30_LCAO_DFTU_DS_S2_Z/INPUT
@@ -0,0 +1,36 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+
+nspin    2
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    1e-3
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/30_LCAO_DFTU_DS_S2_Z/KPT b/tests/17_DS_DFTU/30_LCAO_DFTU_DS_S2_Z/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/30_LCAO_DFTU_DS_S2_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/30_LCAO_DFTU_DS_S2_Z/STRU b/tests/17_DS_DFTU/30_LCAO_DFTU_DS_S2_Z/STRU
new file mode 100644
index 00000000000..8535c1db16e
--- /dev/null
+++ b/tests/17_DS_DFTU/30_LCAO_DFTU_DS_S2_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0
+0.51   0.51   0.51   mag  -2.0
diff --git a/tests/17_DS_DFTU/30_LCAO_DFTU_DS_S2_Z/result.ref b/tests/17_DS_DFTU/30_LCAO_DFTU_DS_S2_Z/result.ref
new file mode 100644
index 00000000000..4ca0323afc6
--- /dev/null
+++ b/tests/17_DS_DFTU/30_LCAO_DFTU_DS_S2_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6772.1000709242498488
+etotperatomref -3386.0500837373
+totaltimeref 6.08
diff --git a/tests/17_DS_DFTU/31_LCAO_DFTU_DS_S4_XY/INPUT b/tests/17_DS_DFTU/31_LCAO_DFTU_DS_S4_XY/INPUT
new file mode 100644
index 00000000000..5312a11245b
--- /dev/null
+++ b/tests/17_DS_DFTU/31_LCAO_DFTU_DS_S4_XY/INPUT
@@ -0,0 +1,36 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+
+noncolin    1
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    1e-3
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/31_LCAO_DFTU_DS_S4_XY/KPT b/tests/17_DS_DFTU/31_LCAO_DFTU_DS_S4_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/31_LCAO_DFTU_DS_S4_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/31_LCAO_DFTU_DS_S4_XY/STRU b/tests/17_DS_DFTU/31_LCAO_DFTU_DS_S4_XY/STRU
new file mode 100644
index 00000000000..ac54252cab0
--- /dev/null
+++ b/tests/17_DS_DFTU/31_LCAO_DFTU_DS_S4_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0 sc 1 1 0
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0 sc 1 1 0
diff --git a/tests/17_DS_DFTU/31_LCAO_DFTU_DS_S4_XY/result.ref b/tests/17_DS_DFTU/31_LCAO_DFTU_DS_S4_XY/result.ref
new file mode 100644
index 00000000000..1c22701c6f4
--- /dev/null
+++ b/tests/17_DS_DFTU/31_LCAO_DFTU_DS_S4_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6770.7755674724503479
+etotperatomref -3385.3877837362
+totaltimeref 21.27
diff --git a/tests/17_DS_DFTU/32_LCAO_DFTU_DS_S4_XYZ/INPUT b/tests/17_DS_DFTU/32_LCAO_DFTU_DS_S4_XYZ/INPUT
new file mode 100644
index 00000000000..19873fdd2dc
--- /dev/null
+++ b/tests/17_DS_DFTU/32_LCAO_DFTU_DS_S4_XYZ/INPUT
@@ -0,0 +1,36 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+
+noncolin    1
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    100
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    1e-2
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/32_LCAO_DFTU_DS_S4_XYZ/KPT b/tests/17_DS_DFTU/32_LCAO_DFTU_DS_S4_XYZ/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/32_LCAO_DFTU_DS_S4_XYZ/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/32_LCAO_DFTU_DS_S4_XYZ/STRU b/tests/17_DS_DFTU/32_LCAO_DFTU_DS_S4_XYZ/STRU
new file mode 100644
index 00000000000..928faa5b29c
--- /dev/null
+++ b/tests/17_DS_DFTU/32_LCAO_DFTU_DS_S4_XYZ/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  1.155  1.155  1.155 sc 1 1 1
+0.51   0.51   0.51   magmom  -1.155  -1.155  -1.155 sc 1 1 1
diff --git a/tests/17_DS_DFTU/32_LCAO_DFTU_DS_S4_XYZ/result.ref b/tests/17_DS_DFTU/32_LCAO_DFTU_DS_S4_XYZ/result.ref
new file mode 100644
index 00000000000..c3db185c468
--- /dev/null
+++ b/tests/17_DS_DFTU/32_LCAO_DFTU_DS_S4_XYZ/result.ref
@@ -0,0 +1,3 @@
+etotref -6770.9783920605041203
+etotperatomref -3385.4891960303
+totaltimeref 44.54
diff --git a/tests/17_DS_DFTU/33_LCAO_DFTU_DS_S4_Z/INPUT b/tests/17_DS_DFTU/33_LCAO_DFTU_DS_S4_Z/INPUT
new file mode 100644
index 00000000000..092d43abcb7
--- /dev/null
+++ b/tests/17_DS_DFTU/33_LCAO_DFTU_DS_S4_Z/INPUT
@@ -0,0 +1,35 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+noncolin    1
+#nbands    40
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    1e-3
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/33_LCAO_DFTU_DS_S4_Z/KPT b/tests/17_DS_DFTU/33_LCAO_DFTU_DS_S4_Z/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/33_LCAO_DFTU_DS_S4_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/33_LCAO_DFTU_DS_S4_Z/STRU b/tests/17_DS_DFTU/33_LCAO_DFTU_DS_S4_Z/STRU
new file mode 100644
index 00000000000..b6c219f04a6
--- /dev/null
+++ b/tests/17_DS_DFTU/33_LCAO_DFTU_DS_S4_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  0.0  0.0  2.0 sc 0 0 1
+0.51   0.51   0.51   magmom  0.0  0.0  -2.0 sc 0 0 1
diff --git a/tests/17_DS_DFTU/33_LCAO_DFTU_DS_S4_Z/result.ref b/tests/17_DS_DFTU/33_LCAO_DFTU_DS_S4_Z/result.ref
new file mode 100644
index 00000000000..16711d4360e
--- /dev/null
+++ b/tests/17_DS_DFTU/33_LCAO_DFTU_DS_S4_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6770.7660917255634558
+etotperatomref -3385.3830458628
+totaltimeref 19.80
diff --git a/tests/17_DS_DFTU/36_PW_DS_S2_ReadLam_Z/INPUT b/tests/17_DS_DFTU/36_PW_DS_S2_ReadLam_Z/INPUT
new file mode 100644
index 00000000000..0ff2ef9a1a7
--- /dev/null
+++ b/tests/17_DS_DFTU/36_PW_DS_S2_ReadLam_Z/INPUT
@@ -0,0 +1,33 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+
+noncolin    0
+nspin    2
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+kpar    1
+
+# DeltaSpin parameters -- nsc=1: read lambda only, skip iterative optimization
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    1
+nsc_min    1
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+
+pw_seed 1
diff --git a/tests/17_DS_DFTU/36_PW_DS_S2_ReadLam_Z/KPT b/tests/17_DS_DFTU/36_PW_DS_S2_ReadLam_Z/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/36_PW_DS_S2_ReadLam_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/36_PW_DS_S2_ReadLam_Z/STRU b/tests/17_DS_DFTU/36_PW_DS_S2_ReadLam_Z/STRU
new file mode 100644
index 00000000000..115ded29104
--- /dev/null
+++ b/tests/17_DS_DFTU/36_PW_DS_S2_ReadLam_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0   0.0   0.0  sc 1 1 1
+0.51   0.51   0.51   mag -2.0   0.0   0.0  sc 1 1 1
diff --git a/tests/17_DS_DFTU/36_PW_DS_S2_ReadLam_Z/result.ref b/tests/17_DS_DFTU/36_PW_DS_S2_ReadLam_Z/result.ref
new file mode 100644
index 00000000000..cb8011ebea5
--- /dev/null
+++ b/tests/17_DS_DFTU/36_PW_DS_S2_ReadLam_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6368.964006946522
+etotperatomref -3184.4820034733
+totaltimeref 4.42
diff --git a/tests/17_DS_DFTU/37_PW_DS_S4_ReadLam_XY/INPUT b/tests/17_DS_DFTU/37_PW_DS_S4_ReadLam_XY/INPUT
new file mode 100644
index 00000000000..b5def492a60
--- /dev/null
+++ b/tests/17_DS_DFTU/37_PW_DS_S4_ReadLam_XY/INPUT
@@ -0,0 +1,32 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+
+noncolin    1
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+kpar    2
+
+# DeltaSpin parameters -- nsc=1: read lambda only, skip iterative optimization
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    1
+nsc_min    1
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+
+pw_seed 1
diff --git a/tests/17_DS_DFTU/37_PW_DS_S4_ReadLam_XY/KPT b/tests/17_DS_DFTU/37_PW_DS_S4_ReadLam_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/37_PW_DS_S4_ReadLam_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/37_PW_DS_S4_ReadLam_XY/STRU b/tests/17_DS_DFTU/37_PW_DS_S4_ReadLam_XY/STRU
new file mode 100644
index 00000000000..115ded29104
--- /dev/null
+++ b/tests/17_DS_DFTU/37_PW_DS_S4_ReadLam_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0   0.0   0.0  sc 1 1 1
+0.51   0.51   0.51   mag -2.0   0.0   0.0  sc 1 1 1
diff --git a/tests/17_DS_DFTU/37_PW_DS_S4_ReadLam_XY/result.ref b/tests/17_DS_DFTU/37_PW_DS_S4_ReadLam_XY/result.ref
new file mode 100644
index 00000000000..8f1a3fb245c
--- /dev/null
+++ b/tests/17_DS_DFTU/37_PW_DS_S4_ReadLam_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6370.632169015013
+etotperatomref -3185.3160845075
+totaltimeref 4.29
diff --git a/tests/17_DS_DFTU/38_PW_DS_S2_Thr1e10_Z/INPUT b/tests/17_DS_DFTU/38_PW_DS_S2_Thr1e10_Z/INPUT
new file mode 100644
index 00000000000..544b67acd29
--- /dev/null
+++ b/tests/17_DS_DFTU/38_PW_DS_S2_Thr1e10_Z/INPUT
@@ -0,0 +1,33 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+
+noncolin    0
+nspin    2
+scf_thr    1.0e-6
+scf_nmax    100
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+kpar    1
+
+# DeltaSpin -- strict convergence threshold
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    1e-10
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+
+pw_seed 1
diff --git a/tests/17_DS_DFTU/38_PW_DS_S2_Thr1e10_Z/KPT b/tests/17_DS_DFTU/38_PW_DS_S2_Thr1e10_Z/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/38_PW_DS_S2_Thr1e10_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/38_PW_DS_S2_Thr1e10_Z/STRU b/tests/17_DS_DFTU/38_PW_DS_S2_Thr1e10_Z/STRU
new file mode 100644
index 00000000000..b43039501d3
--- /dev/null
+++ b/tests/17_DS_DFTU/38_PW_DS_S2_Thr1e10_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0   0.0   0.0
+0.51   0.51   0.51   mag -2.0   0.0   0.0
diff --git a/tests/17_DS_DFTU/38_PW_DS_S2_Thr1e10_Z/result.ref b/tests/17_DS_DFTU/38_PW_DS_S2_Thr1e10_Z/result.ref
new file mode 100644
index 00000000000..58e32cd1c0d
--- /dev/null
+++ b/tests/17_DS_DFTU/38_PW_DS_S2_Thr1e10_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6368.964006945744
+etotperatomref -3184.4820034729
+totaltimeref 6.33
diff --git a/tests/17_DS_DFTU/39_PW_DS_S4_Thr1e10_XY/INPUT b/tests/17_DS_DFTU/39_PW_DS_S4_Thr1e10_XY/INPUT
new file mode 100644
index 00000000000..adac5688d14
--- /dev/null
+++ b/tests/17_DS_DFTU/39_PW_DS_S4_Thr1e10_XY/INPUT
@@ -0,0 +1,32 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+
+noncolin    1
+scf_thr    1.0e-6
+scf_nmax    100
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+kpar    2
+
+# DeltaSpin -- strict convergence threshold
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    1e-10
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+
+pw_seed 1
diff --git a/tests/17_DS_DFTU/39_PW_DS_S4_Thr1e10_XY/KPT b/tests/17_DS_DFTU/39_PW_DS_S4_Thr1e10_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/39_PW_DS_S4_Thr1e10_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/39_PW_DS_S4_Thr1e10_XY/STRU b/tests/17_DS_DFTU/39_PW_DS_S4_Thr1e10_XY/STRU
new file mode 100644
index 00000000000..b43039501d3
--- /dev/null
+++ b/tests/17_DS_DFTU/39_PW_DS_S4_Thr1e10_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0   0.0   0.0
+0.51   0.51   0.51   mag -2.0   0.0   0.0
diff --git a/tests/17_DS_DFTU/39_PW_DS_S4_Thr1e10_XY/result.ref b/tests/17_DS_DFTU/39_PW_DS_S4_Thr1e10_XY/result.ref
new file mode 100644
index 00000000000..8507c130334
--- /dev/null
+++ b/tests/17_DS_DFTU/39_PW_DS_S4_Thr1e10_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6370.632169015102
+etotperatomref -3185.3160845076
+totaltimeref 3.72
diff --git a/tests/17_DS_DFTU/40_PW_DS_S2_Thr10_Z/INPUT b/tests/17_DS_DFTU/40_PW_DS_S2_Thr10_Z/INPUT
new file mode 100644
index 00000000000..18df498a747
--- /dev/null
+++ b/tests/17_DS_DFTU/40_PW_DS_S2_Thr10_Z/INPUT
@@ -0,0 +1,34 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+
+noncolin    0
+nspin    2
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+out_alllog    1
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+kpar    1
+
+# DeltaSpin -- loose convergence threshold
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+
+pw_seed 1
diff --git a/tests/17_DS_DFTU/40_PW_DS_S2_Thr10_Z/KPT b/tests/17_DS_DFTU/40_PW_DS_S2_Thr10_Z/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/40_PW_DS_S2_Thr10_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/40_PW_DS_S2_Thr10_Z/STRU b/tests/17_DS_DFTU/40_PW_DS_S2_Thr10_Z/STRU
new file mode 100644
index 00000000000..b43039501d3
--- /dev/null
+++ b/tests/17_DS_DFTU/40_PW_DS_S2_Thr10_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0   0.0   0.0
+0.51   0.51   0.51   mag -2.0   0.0   0.0
diff --git a/tests/17_DS_DFTU/40_PW_DS_S2_Thr10_Z/result.ref b/tests/17_DS_DFTU/40_PW_DS_S2_Thr10_Z/result.ref
new file mode 100644
index 00000000000..05c80f4f708
--- /dev/null
+++ b/tests/17_DS_DFTU/40_PW_DS_S2_Thr10_Z/result.ref
@@ -0,0 +1,4 @@
+etotref -6368.964006946539
+etotperatomref -3184.4820034733
+log_filename_validation 1
+totaltimeref 3.74
diff --git a/tests/17_DS_DFTU/41_PW_DS_S4_Thr10_XY/INPUT b/tests/17_DS_DFTU/41_PW_DS_S4_Thr10_XY/INPUT
new file mode 100644
index 00000000000..38276dc8689
--- /dev/null
+++ b/tests/17_DS_DFTU/41_PW_DS_S4_Thr10_XY/INPUT
@@ -0,0 +1,32 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+
+noncolin    1
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+kpar    2
+
+# DeltaSpin -- loose convergence threshold
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    0.1
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+
+pw_seed 1
diff --git a/tests/17_DS_DFTU/41_PW_DS_S4_Thr10_XY/KPT b/tests/17_DS_DFTU/41_PW_DS_S4_Thr10_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/41_PW_DS_S4_Thr10_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/41_PW_DS_S4_Thr10_XY/STRU b/tests/17_DS_DFTU/41_PW_DS_S4_Thr10_XY/STRU
new file mode 100644
index 00000000000..115ded29104
--- /dev/null
+++ b/tests/17_DS_DFTU/41_PW_DS_S4_Thr10_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0   0.0   0.0  sc 1 1 1
+0.51   0.51   0.51   mag -2.0   0.0   0.0  sc 1 1 1
diff --git a/tests/17_DS_DFTU/41_PW_DS_S4_Thr10_XY/result.ref b/tests/17_DS_DFTU/41_PW_DS_S4_Thr10_XY/result.ref
new file mode 100644
index 00000000000..acf31e682c6
--- /dev/null
+++ b/tests/17_DS_DFTU/41_PW_DS_S4_Thr10_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6366.564253345298
+etotperatomref -3183.2821266726
+totaltimeref 7.03
diff --git a/tests/17_DS_DFTU/42_PW_DFTU_DS_S2_Thr1e10_Z/INPUT b/tests/17_DS_DFTU/42_PW_DFTU_DS_S2_Thr1e10_Z/INPUT
new file mode 100644
index 00000000000..e7bfa8339a9
--- /dev/null
+++ b/tests/17_DS_DFTU/42_PW_DFTU_DS_S2_Thr1e10_Z/INPUT
@@ -0,0 +1,39 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+
+noncolin    0
+nspin    2
+scf_thr    1.0e-6
+scf_nmax    100
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+kpar    1
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+# DeltaSpin -- strict convergence threshold
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    1e-10
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+
+pw_seed 1
diff --git a/tests/17_DS_DFTU/42_PW_DFTU_DS_S2_Thr1e10_Z/KPT b/tests/17_DS_DFTU/42_PW_DFTU_DS_S2_Thr1e10_Z/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/42_PW_DFTU_DS_S2_Thr1e10_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/42_PW_DFTU_DS_S2_Thr1e10_Z/STRU b/tests/17_DS_DFTU/42_PW_DFTU_DS_S2_Thr1e10_Z/STRU
new file mode 100644
index 00000000000..b43039501d3
--- /dev/null
+++ b/tests/17_DS_DFTU/42_PW_DFTU_DS_S2_Thr1e10_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0   0.0   0.0
+0.51   0.51   0.51   mag -2.0   0.0   0.0
diff --git a/tests/17_DS_DFTU/42_PW_DFTU_DS_S2_Thr1e10_Z/result.ref b/tests/17_DS_DFTU/42_PW_DFTU_DS_S2_Thr1e10_Z/result.ref
new file mode 100644
index 00000000000..016eee221e3
--- /dev/null
+++ b/tests/17_DS_DFTU/42_PW_DFTU_DS_S2_Thr1e10_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6363.8892809126737120
+etotperatomref -3181.9446404562
+totaltimeref 2.86
diff --git a/tests/17_DS_DFTU/43_PW_DFTU_DS_S4_Thr1e10_XY/INPUT b/tests/17_DS_DFTU/43_PW_DFTU_DS_S4_Thr1e10_XY/INPUT
new file mode 100644
index 00000000000..4629ed7b775
--- /dev/null
+++ b/tests/17_DS_DFTU/43_PW_DFTU_DS_S4_Thr1e10_XY/INPUT
@@ -0,0 +1,38 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+
+noncolin    1
+scf_thr    1.0e-6
+scf_nmax    100
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+kpar    2
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+# DeltaSpin -- strict convergence threshold
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    1e-10
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+
+pw_seed 1
diff --git a/tests/17_DS_DFTU/43_PW_DFTU_DS_S4_Thr1e10_XY/KPT b/tests/17_DS_DFTU/43_PW_DFTU_DS_S4_Thr1e10_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/43_PW_DFTU_DS_S4_Thr1e10_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/43_PW_DFTU_DS_S4_Thr1e10_XY/STRU b/tests/17_DS_DFTU/43_PW_DFTU_DS_S4_Thr1e10_XY/STRU
new file mode 100644
index 00000000000..b43039501d3
--- /dev/null
+++ b/tests/17_DS_DFTU/43_PW_DFTU_DS_S4_Thr1e10_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0   0.0   0.0
+0.51   0.51   0.51   mag -2.0   0.0   0.0
diff --git a/tests/17_DS_DFTU/43_PW_DFTU_DS_S4_Thr1e10_XY/result.ref b/tests/17_DS_DFTU/43_PW_DFTU_DS_S4_Thr1e10_XY/result.ref
new file mode 100644
index 00000000000..fc8f809a7dd
--- /dev/null
+++ b/tests/17_DS_DFTU/43_PW_DFTU_DS_S4_Thr1e10_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6364.2658763871213523
+etotperatomref -3182.1329381935606761
+totaltimeref 3.0
diff --git a/tests/17_DS_DFTU/44_PW_DFTU_DS_S2_Thr10_Z/INPUT b/tests/17_DS_DFTU/44_PW_DFTU_DS_S2_Thr10_Z/INPUT
new file mode 100644
index 00000000000..f5b277d5766
--- /dev/null
+++ b/tests/17_DS_DFTU/44_PW_DFTU_DS_S2_Thr10_Z/INPUT
@@ -0,0 +1,39 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+
+noncolin    0
+nspin    2
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+kpar    1
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+# DeltaSpin -- loose convergence threshold
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+
+pw_seed 1
diff --git a/tests/17_DS_DFTU/44_PW_DFTU_DS_S2_Thr10_Z/KPT b/tests/17_DS_DFTU/44_PW_DFTU_DS_S2_Thr10_Z/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/44_PW_DFTU_DS_S2_Thr10_Z/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/44_PW_DFTU_DS_S2_Thr10_Z/STRU b/tests/17_DS_DFTU/44_PW_DFTU_DS_S2_Thr10_Z/STRU
new file mode 100644
index 00000000000..115ded29104
--- /dev/null
+++ b/tests/17_DS_DFTU/44_PW_DFTU_DS_S2_Thr10_Z/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0   0.0   0.0  sc 1 1 1
+0.51   0.51   0.51   mag -2.0   0.0   0.0  sc 1 1 1
diff --git a/tests/17_DS_DFTU/44_PW_DFTU_DS_S2_Thr10_Z/result.ref b/tests/17_DS_DFTU/44_PW_DFTU_DS_S2_Thr10_Z/result.ref
new file mode 100644
index 00000000000..f6a9e0e5326
--- /dev/null
+++ b/tests/17_DS_DFTU/44_PW_DFTU_DS_S2_Thr10_Z/result.ref
@@ -0,0 +1,3 @@
+etotref -6358.1300039981460941
+etotperatomref -3179.0650019991
+totaltimeref 11.19
diff --git a/tests/17_DS_DFTU/45_PW_DFTU_DS_S4_Thr10_XY/INPUT b/tests/17_DS_DFTU/45_PW_DFTU_DS_S4_Thr10_XY/INPUT
new file mode 100644
index 00000000000..bd4a2bedb7d
--- /dev/null
+++ b/tests/17_DS_DFTU/45_PW_DFTU_DS_S4_Thr10_XY/INPUT
@@ -0,0 +1,38 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+
+noncolin    1
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+kpar    2
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+# DeltaSpin -- loose convergence threshold
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    0.1
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+
+pw_seed 1
diff --git a/tests/17_DS_DFTU/45_PW_DFTU_DS_S4_Thr10_XY/KPT b/tests/17_DS_DFTU/45_PW_DFTU_DS_S4_Thr10_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/45_PW_DFTU_DS_S4_Thr10_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/45_PW_DFTU_DS_S4_Thr10_XY/STRU b/tests/17_DS_DFTU/45_PW_DFTU_DS_S4_Thr10_XY/STRU
new file mode 100644
index 00000000000..115ded29104
--- /dev/null
+++ b/tests/17_DS_DFTU/45_PW_DFTU_DS_S4_Thr10_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0   0.0   0.0  sc 1 1 1
+0.51   0.51   0.51   mag -2.0   0.0   0.0  sc 1 1 1
diff --git a/tests/17_DS_DFTU/45_PW_DFTU_DS_S4_Thr10_XY/result.ref b/tests/17_DS_DFTU/45_PW_DFTU_DS_S4_Thr10_XY/result.ref
new file mode 100644
index 00000000000..14b18ac3fd4
--- /dev/null
+++ b/tests/17_DS_DFTU/45_PW_DFTU_DS_S4_Thr10_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6355.9593396534382919
+etotperatomref -3177.9796698267
+totaltimeref 9.42
diff --git a/tests/17_DS_DFTU/50_FeO_O_first_Fe_second/INPUT b/tests/17_DS_DFTU/50_FeO_O_first_Fe_second/INPUT
new file mode 100644
index 00000000000..3fad36b4e06
--- /dev/null
+++ b/tests/17_DS_DFTU/50_FeO_O_first_Fe_second/INPUT
@@ -0,0 +1,27 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    50
+gamma_only    0
+
+nspin    2
+scf_thr    1.0e-6
+scf_nmax    100
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    -1 2
+hubbard_u    0 5.0
+onsite_radius   3.0
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+pw_seed 1
\ No newline at end of file
diff --git a/tests/17_DS_DFTU/50_FeO_O_first_Fe_second/KPT b/tests/17_DS_DFTU/50_FeO_O_first_Fe_second/KPT
new file mode 100644
index 00000000000..1f26d6a8a39
--- /dev/null
+++ b/tests/17_DS_DFTU/50_FeO_O_first_Fe_second/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
\ No newline at end of file
diff --git a/tests/17_DS_DFTU/50_FeO_O_first_Fe_second/STRU b/tests/17_DS_DFTU/50_FeO_O_first_Fe_second/STRU
new file mode 100644
index 00000000000..5d4d02a4e1b
--- /dev/null
+++ b/tests/17_DS_DFTU/50_FeO_O_first_Fe_second/STRU
@@ -0,0 +1,29 @@
+ATOMIC_SPECIES
+O 1.000 O.upf
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+8_O_gga_100Ry_7au_2s2p1d.orb
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+O
+0.0
+2
+0.50   0.00   0.00
+0.00   0.50   0.50
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0
+0.50   0.50   0.50   mag  -2.0
\ No newline at end of file
diff --git a/tests/17_DS_DFTU/50_FeO_O_first_Fe_second/result.ref b/tests/17_DS_DFTU/50_FeO_O_first_Fe_second/result.ref
new file mode 100644
index 00000000000..380a0b7b372
--- /dev/null
+++ b/tests/17_DS_DFTU/50_FeO_O_first_Fe_second/result.ref
@@ -0,0 +1,3 @@
+etotref -7652.39578275317
+etotperatomref -1913.0989456883
+totaltimeref 2
\ No newline at end of file
diff --git a/tests/17_DS_DFTU/51_FeO_Fe_first_O_second/INPUT b/tests/17_DS_DFTU/51_FeO_Fe_first_O_second/INPUT
new file mode 100644
index 00000000000..ef007293197
--- /dev/null
+++ b/tests/17_DS_DFTU/51_FeO_Fe_first_O_second/INPUT
@@ -0,0 +1,27 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    50
+gamma_only    0
+
+nspin    2
+scf_thr    1.0e-6
+scf_nmax    100
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2 -1
+hubbard_u    5.0 0
+onsite_radius   3.0
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+pw_seed 1
\ No newline at end of file
diff --git a/tests/17_DS_DFTU/51_FeO_Fe_first_O_second/KPT b/tests/17_DS_DFTU/51_FeO_Fe_first_O_second/KPT
new file mode 100644
index 00000000000..1f26d6a8a39
--- /dev/null
+++ b/tests/17_DS_DFTU/51_FeO_Fe_first_O_second/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
\ No newline at end of file
diff --git a/tests/17_DS_DFTU/51_FeO_Fe_first_O_second/STRU b/tests/17_DS_DFTU/51_FeO_Fe_first_O_second/STRU
new file mode 100644
index 00000000000..d18fb1e38fb
--- /dev/null
+++ b/tests/17_DS_DFTU/51_FeO_Fe_first_O_second/STRU
@@ -0,0 +1,29 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+O 1.000 O.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+8_O_gga_100Ry_7au_2s2p1d.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   mag  2.0
+0.50   0.50   0.50   mag  -2.0
+
+O
+0.0
+2
+0.50   0.00   0.00
+0.00   0.50   0.50
diff --git a/tests/17_DS_DFTU/51_FeO_Fe_first_O_second/result.ref b/tests/17_DS_DFTU/51_FeO_Fe_first_O_second/result.ref
new file mode 100644
index 00000000000..380a0b7b372
--- /dev/null
+++ b/tests/17_DS_DFTU/51_FeO_Fe_first_O_second/result.ref
@@ -0,0 +1,3 @@
+etotref -7652.39578275317
+etotperatomref -1913.0989456883
+totaltimeref 2
\ No newline at end of file
diff --git a/tests/17_DS_DFTU/55_PW_DS_NSCF_S4_XY/INPUT b/tests/17_DS_DFTU/55_PW_DS_NSCF_S4_XY/INPUT
new file mode 100644
index 00000000000..23afec5db2a
--- /dev/null
+++ b/tests/17_DS_DFTU/55_PW_DS_NSCF_S4_XY/INPUT
@@ -0,0 +1,33 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    nscf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+init_chg    file
+read_file_dir    ./
+noncolin    1
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    1
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+
+kpar    1
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+pw_seed 1
diff --git a/tests/17_DS_DFTU/55_PW_DS_NSCF_S4_XY/KPT b/tests/17_DS_DFTU/55_PW_DS_NSCF_S4_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/55_PW_DS_NSCF_S4_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/55_PW_DS_NSCF_S4_XY/STRU b/tests/17_DS_DFTU/55_PW_DS_NSCF_S4_XY/STRU
new file mode 100644
index 00000000000..1ffecf17384
--- /dev/null
+++ b/tests/17_DS_DFTU/55_PW_DS_NSCF_S4_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0  sc 1 1 1
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0  sc 1 1 1
diff --git a/tests/17_DS_DFTU/55_PW_DS_NSCF_S4_XY/autotest-CHARGE-DENSITY.restart b/tests/17_DS_DFTU/55_PW_DS_NSCF_S4_XY/autotest-CHARGE-DENSITY.restart
new file mode 100644
index 00000000000..d0f08d99cae
Binary files /dev/null and b/tests/17_DS_DFTU/55_PW_DS_NSCF_S4_XY/autotest-CHARGE-DENSITY.restart differ
diff --git a/tests/17_DS_DFTU/55_PW_DS_NSCF_S4_XY/result.ref b/tests/17_DS_DFTU/55_PW_DS_NSCF_S4_XY/result.ref
new file mode 100644
index 00000000000..fc63371aa3b
--- /dev/null
+++ b/tests/17_DS_DFTU/55_PW_DS_NSCF_S4_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6369.751780117049
+etotperatomref -3184.875890058524
+totaltimeref 2
diff --git a/tests/17_DS_DFTU/56_PW_DS_S4_DirectionOnly_XY/INPUT b/tests/17_DS_DFTU/56_PW_DS_S4_DirectionOnly_XY/INPUT
new file mode 100644
index 00000000000..417f4c65bdf
--- /dev/null
+++ b/tests/17_DS_DFTU/56_PW_DS_S4_DirectionOnly_XY/INPUT
@@ -0,0 +1,32 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+noncolin    1
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DeltaSpin parameters - direction only mode
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+sc_direction_only    1
+
+kpar    1
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+pw_seed 1
diff --git a/tests/17_DS_DFTU/56_PW_DS_S4_DirectionOnly_XY/KPT b/tests/17_DS_DFTU/56_PW_DS_S4_DirectionOnly_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/56_PW_DS_S4_DirectionOnly_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/56_PW_DS_S4_DirectionOnly_XY/STRU b/tests/17_DS_DFTU/56_PW_DS_S4_DirectionOnly_XY/STRU
new file mode 100644
index 00000000000..1ffecf17384
--- /dev/null
+++ b/tests/17_DS_DFTU/56_PW_DS_S4_DirectionOnly_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0  sc 1 1 1
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0  sc 1 1 1
diff --git a/tests/17_DS_DFTU/56_PW_DS_S4_DirectionOnly_XY/result.ref b/tests/17_DS_DFTU/56_PW_DS_S4_DirectionOnly_XY/result.ref
new file mode 100644
index 00000000000..a834063a0df
--- /dev/null
+++ b/tests/17_DS_DFTU/56_PW_DS_S4_DirectionOnly_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6370.63216907371
+etotperatomref -3185.3160845369
+totaltimeref 4.94
diff --git a/tests/17_DS_DFTU/57_PW_DFTU_DS_S4_DirectionOnly_XY/INPUT b/tests/17_DS_DFTU/57_PW_DFTU_DS_S4_DirectionOnly_XY/INPUT
new file mode 100644
index 00000000000..becbb440c6a
--- /dev/null
+++ b/tests/17_DS_DFTU/57_PW_DFTU_DS_S4_DirectionOnly_XY/INPUT
@@ -0,0 +1,39 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+
+noncolin    1
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+# DeltaSpin parameters - direction only mode
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+sc_direction_only    1
+
+kpar    1
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+
+pw_seed 1
diff --git a/tests/17_DS_DFTU/57_PW_DFTU_DS_S4_DirectionOnly_XY/KPT b/tests/17_DS_DFTU/57_PW_DFTU_DS_S4_DirectionOnly_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/57_PW_DFTU_DS_S4_DirectionOnly_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/57_PW_DFTU_DS_S4_DirectionOnly_XY/STRU b/tests/17_DS_DFTU/57_PW_DFTU_DS_S4_DirectionOnly_XY/STRU
new file mode 100644
index 00000000000..1ffecf17384
--- /dev/null
+++ b/tests/17_DS_DFTU/57_PW_DFTU_DS_S4_DirectionOnly_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0  sc 1 1 1
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0  sc 1 1 1
diff --git a/tests/17_DS_DFTU/57_PW_DFTU_DS_S4_DirectionOnly_XY/result.ref b/tests/17_DS_DFTU/57_PW_DFTU_DS_S4_DirectionOnly_XY/result.ref
new file mode 100644
index 00000000000..8179ccd1a53
--- /dev/null
+++ b/tests/17_DS_DFTU/57_PW_DFTU_DS_S4_DirectionOnly_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6364.2658688980172883
+etotperatomref -3182.1329344490
+totaltimeref 5.72
diff --git a/tests/17_DS_DFTU/58_LCAO_DS_S4_DirectionOnly_XY/INPUT b/tests/17_DS_DFTU/58_LCAO_DS_S4_DirectionOnly_XY/INPUT
new file mode 100644
index 00000000000..c66793eac2d
--- /dev/null
+++ b/tests/17_DS_DFTU/58_LCAO_DS_S4_DirectionOnly_XY/INPUT
@@ -0,0 +1,31 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+
+noncolin    1
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    100
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+# DeltaSpin parameters - direction only mode
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    1e-2
+sc_direction_only    1
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/58_LCAO_DS_S4_DirectionOnly_XY/KPT b/tests/17_DS_DFTU/58_LCAO_DS_S4_DirectionOnly_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/58_LCAO_DS_S4_DirectionOnly_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/58_LCAO_DS_S4_DirectionOnly_XY/STRU b/tests/17_DS_DFTU/58_LCAO_DS_S4_DirectionOnly_XY/STRU
new file mode 100644
index 00000000000..17f53a6dcde
--- /dev/null
+++ b/tests/17_DS_DFTU/58_LCAO_DS_S4_DirectionOnly_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0 sc 1 1 1
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0 sc 1 1 1
diff --git a/tests/17_DS_DFTU/58_LCAO_DS_S4_DirectionOnly_XY/result.ref b/tests/17_DS_DFTU/58_LCAO_DS_S4_DirectionOnly_XY/result.ref
new file mode 100644
index 00000000000..108efd3e98c
--- /dev/null
+++ b/tests/17_DS_DFTU/58_LCAO_DS_S4_DirectionOnly_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6777.833281398763
+etotperatomref -3388.9166406994
+totaltimeref 30.31
diff --git a/tests/17_DS_DFTU/59_LCAO_DFTU_DS_S4_DirectionOnly_XY/INPUT b/tests/17_DS_DFTU/59_LCAO_DFTU_DS_S4_DirectionOnly_XY/INPUT
new file mode 100644
index 00000000000..81ea04ea848
--- /dev/null
+++ b/tests/17_DS_DFTU/59_LCAO_DFTU_DS_S4_DirectionOnly_XY/INPUT
@@ -0,0 +1,37 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+
+noncolin    1
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    50
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+# DeltaSpin parameters - direction only mode
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    1e-3
+sc_direction_only    1
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/59_LCAO_DFTU_DS_S4_DirectionOnly_XY/KPT b/tests/17_DS_DFTU/59_LCAO_DFTU_DS_S4_DirectionOnly_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/59_LCAO_DFTU_DS_S4_DirectionOnly_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/59_LCAO_DFTU_DS_S4_DirectionOnly_XY/STRU b/tests/17_DS_DFTU/59_LCAO_DFTU_DS_S4_DirectionOnly_XY/STRU
new file mode 100644
index 00000000000..63c4d14399c
--- /dev/null
+++ b/tests/17_DS_DFTU/59_LCAO_DFTU_DS_S4_DirectionOnly_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0
diff --git a/tests/17_DS_DFTU/59_LCAO_DFTU_DS_S4_DirectionOnly_XY/result.ref b/tests/17_DS_DFTU/59_LCAO_DFTU_DS_S4_DirectionOnly_XY/result.ref
new file mode 100644
index 00000000000..032eb7f214a
--- /dev/null
+++ b/tests/17_DS_DFTU/59_LCAO_DFTU_DS_S4_DirectionOnly_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6366.562695032056
+etotperatomref -3183.2813475160
+totaltimeref 15.00
diff --git a/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/INPUT b/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/INPUT
new file mode 100644
index 00000000000..0222dc1abab
--- /dev/null
+++ b/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/INPUT
@@ -0,0 +1,40 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    nscf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+init_chg    file
+read_file_dir    ./
+noncolin    1
+nbands    40
+scf_thr    1.0e-6
+scf_nmax    1
+out_chg    0
+out_band    1
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+
+kpar    1
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+pw_seed 1
diff --git a/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/KPT b/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/STRU b/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/STRU
new file mode 100644
index 00000000000..1ffecf17384
--- /dev/null
+++ b/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0  sc 1 1 1
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0  sc 1 1 1
diff --git a/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/autotest-CHARGE-DENSITY.restart b/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/autotest-CHARGE-DENSITY.restart
new file mode 100644
index 00000000000..dbfc6545c61
Binary files /dev/null and b/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/autotest-CHARGE-DENSITY.restart differ
diff --git a/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/band.txt.ref b/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/band.txt.ref
new file mode 100644
index 00000000000..3c1e0edcc6b
--- /dev/null
+++ b/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/band.txt.ref
@@ -0,0 +1,4 @@
+   1 0.00000000 -118.74659335 -118.74105316 -117.71243266 -117.71010697 -60.43942922 -60.43198848 -59.24890926 -59.24855148 -59.24621462 -59.24613546 -58.51271748 -58.50555152 -56.29487791 -56.29481572 -56.29287034 -56.29258877 2.64311620 2.64973398 5.67083421 5.67480330 9.99068874 10.00294181 10.18234765 10.18239223 10.18678632 10.18683943 10.24485869 10.24491226 10.24950719 10.24957186 11.34253572 11.35559667 13.07230231 13.07234186 13.07368595 13.07377607 13.58972453 13.58976572 13.59991270 13.60001804
+   2 0.82915620 -118.80368460 -118.79809980 -117.78944153 -117.78708036 -62.05046219 -62.04806635 -60.99915730 -60.99736002 -60.04357408 -60.04053523 -59.52262037 -59.51990186 -58.70518485 -58.70103313 -58.67371124 -58.67288114 6.45311232 6.45926062 7.48320213 7.49000622 8.79761027 8.80043858 9.56020356 9.56551933 9.72542883 9.73498425 9.95285811 9.95607541 10.54676401 10.55014375 10.86623027 10.87447067 11.30366212 11.31071686 12.83052430 12.83751385 12.90110713 12.90426321 13.11638182 13.12544514
+   3 2.24336976 -118.80368557 -118.79809951 -117.78944186 -117.78707950 -62.05008134 -62.04820681 -60.99956034 -60.99721634 -60.04352720 -60.04067929 -59.52259954 -59.52009612 -58.70517255 -58.70103915 -58.67376328 -58.67253821 6.45284517 6.45938177 7.48308029 7.49012123 8.79754168 8.80064706 9.56020632 9.56547628 9.72526202 9.73499741 9.95290081 9.95603601 10.54660460 10.55060293 10.86652043 10.87441346 11.30386626 11.31086260 12.83094329 12.83650890 12.90109634 12.90430775 13.11643926 13.12540010
+   4 3.65758332 -118.80368458 -118.79809967 -117.78944166 -117.78708048 -62.05040961 -62.04811733 -60.99922575 -60.99730699 -60.04353542 -60.04055074 -59.52257959 -59.51993402 -58.70519462 -58.70104660 -58.67372710 -58.67285963 6.45311497 6.45926327 7.48315175 7.49007185 8.79740942 8.80050481 9.56022290 9.56550773 9.72549108 9.73513556 9.95287148 9.95604062 10.54670755 10.55025149 10.86628870 10.87433312 11.30364117 11.31061114 12.83069521 12.83750463 12.90111654 12.90429728 13.11641208 13.12538266
diff --git a/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/onsite.dm b/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/onsite.dm
new file mode 100644
index 00000000000..ba9367a6c2d
--- /dev/null
+++ b/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/onsite.dm
@@ -0,0 +1,26 @@
+atoms  0
+L  2
+zeta  0
+  1.07284172  0.01795797  0.01795797  1.29498054  0.01795797  0.25781967  0.00000000  0.03110411  0.03591594 -0.25781967
+  0.00001170  0.00000304  0.00000304  0.00001265  0.00000304  0.00000743 -0.00000000  0.00000526  0.00000608 -0.00000743
+  0.01795797  0.00000000  0.25781967  0.03110411  1.29498054 -0.03110411 -0.03110411  1.07284172 -0.25781967  0.00000000
+  0.00000304 -0.00000000  0.00000743  0.00000526  0.00001265 -0.00000526 -0.00000526  0.00001170 -0.00000743  0.00000000
+  0.03591594  0.80832072 -0.25781967 -0.01014329 -0.25781967 -0.01014329  0.00000000 -0.00000000  1.29498054 -0.02028657
+  0.00000608 -0.00006150 -0.00000743  0.00000679 -0.00000743  0.00000679  0.00000000 -0.00000000  0.00001265  0.00001359
+ -0.01014329 -0.01014329  0.58933746 -0.26117797 -0.26117797  0.58933746 -0.01756869  0.01756869  0.26117797  0.26117797
+  0.00000679  0.00000679 -0.00001181  0.00000258  0.00000258 -0.00001181  0.00001177 -0.00001177 -0.00000258 -0.00000258
+ -0.00000000 -0.02028657 -0.01756869  0.26117797  0.01756869  0.26117797  0.80832072 -0.00000000 -0.00000000  0.58933746
+ -0.00000000  0.00001359  0.00001177 -0.00000258 -0.00001177 -0.00000258 -0.00006150 -0.00000000 -0.00000000 -0.00001181
+atoms  1
+L  2
+zeta  0
+  1.07330401  0.01806557  0.01806557  1.29470827  0.01806557  0.25729798  0.00000000  0.03129048  0.03613113 -0.25729798
+ -0.00000491 -0.00000164 -0.00000164  0.00000736 -0.00000164 -0.00000999  0.00000000 -0.00000284 -0.00000328  0.00000999
+  0.01806557  0.00000000  0.25729798  0.03129048  1.29470827 -0.03129048 -0.03129048  1.07330401 -0.25729798 -0.00000000
+ -0.00000164  0.00000000 -0.00000999 -0.00000284  0.00000736  0.00000284  0.00000284 -0.00000491  0.00000999 -0.00000000
+  0.03613113 -0.80780604 -0.25729798  0.01023653 -0.25729798  0.01023653 -0.00000000  0.00000000  1.29470827  0.02047305
+ -0.00000328 -0.00003287  0.00000999 -0.00000948  0.00000999 -0.00000948 -0.00000000  0.00000000  0.00000736 -0.00001895
+  0.01023653  0.01023653 -0.58946235  0.26074301  0.26074301 -0.58946235  0.01773018 -0.01773018 -0.26074301 -0.26074301
+ -0.00000948 -0.00000948 -0.00003291  0.00002481  0.00002481 -0.00003291 -0.00001641  0.00001641 -0.00002481 -0.00002481
+  0.00000000  0.02047305  0.01773018 -0.26074301 -0.01773018 -0.26074301 -0.80780604 -0.00000000 -0.00000000 -0.58946235
+  0.00000000 -0.00001895 -0.00001641 -0.00002481  0.00001641 -0.00002481 -0.00003287  0.00000000  0.00000000 -0.00003291
diff --git a/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/result.ref b/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/result.ref
new file mode 100644
index 00000000000..ff360e3c3e1
--- /dev/null
+++ b/tests/17_DS_DFTU/60_PW_DFTU_DS_NSCF_Band_XY/result.ref
@@ -0,0 +1,4 @@
+etotref -6368.988301997129
+etotperatomref -3184.494150998564
+CompareBand_pass 0
+totaltimeref 1
diff --git a/tests/17_DS_DFTU/61_LCAO_DS_NSCF_S4_XY/INPUT b/tests/17_DS_DFTU/61_LCAO_DS_NSCF_S4_XY/INPUT
new file mode 100644
index 00000000000..342edd1bc4a
--- /dev/null
+++ b/tests/17_DS_DFTU/61_LCAO_DS_NSCF_S4_XY/INPUT
@@ -0,0 +1,31 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    nscf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+init_chg    file
+read_file_dir    ./
+noncolin    1
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    1
+out_chg    0
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/61_LCAO_DS_NSCF_S4_XY/KPT b/tests/17_DS_DFTU/61_LCAO_DS_NSCF_S4_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/61_LCAO_DS_NSCF_S4_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/61_LCAO_DS_NSCF_S4_XY/STRU b/tests/17_DS_DFTU/61_LCAO_DS_NSCF_S4_XY/STRU
new file mode 100644
index 00000000000..17f53a6dcde
--- /dev/null
+++ b/tests/17_DS_DFTU/61_LCAO_DS_NSCF_S4_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0 sc 1 1 1
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0 sc 1 1 1
diff --git a/tests/17_DS_DFTU/61_LCAO_DS_NSCF_S4_XY/autotest-CHARGE-DENSITY.restart b/tests/17_DS_DFTU/61_LCAO_DS_NSCF_S4_XY/autotest-CHARGE-DENSITY.restart
new file mode 100644
index 00000000000..6126ae30da4
Binary files /dev/null and b/tests/17_DS_DFTU/61_LCAO_DS_NSCF_S4_XY/autotest-CHARGE-DENSITY.restart differ
diff --git a/tests/17_DS_DFTU/61_LCAO_DS_NSCF_S4_XY/result.ref b/tests/17_DS_DFTU/61_LCAO_DS_NSCF_S4_XY/result.ref
new file mode 100644
index 00000000000..95b35a92c67
--- /dev/null
+++ b/tests/17_DS_DFTU/61_LCAO_DS_NSCF_S4_XY/result.ref
@@ -0,0 +1,3 @@
+etotref -6777.389070306773
+etotperatomref -3388.6945351534
+totaltimeref 0.38
diff --git a/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/INPUT b/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/INPUT
new file mode 100644
index 00000000000..a550e0f2d34
--- /dev/null
+++ b/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/INPUT
@@ -0,0 +1,29 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    scf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+init_chg    file
+read_file_dir    ./
+noncolin    1
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    100
+out_chg    0
+out_band    1
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    0
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/KPT b/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/STRU b/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/STRU
new file mode 100644
index 00000000000..63c4d14399c
--- /dev/null
+++ b/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0
diff --git a/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/autotest-CHARGE-DENSITY.restart b/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/autotest-CHARGE-DENSITY.restart
new file mode 100644
index 00000000000..2c8f427bfa2
Binary files /dev/null and b/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/autotest-CHARGE-DENSITY.restart differ
diff --git a/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/band.txt.ref b/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/band.txt.ref
new file mode 100644
index 00000000000..766c6af2f2f
--- /dev/null
+++ b/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/band.txt.ref
@@ -0,0 +1,4 @@
+   1 0.00000000 -81.35798613 -80.91162162 -77.87936703 -76.74784146 -48.21252989 -48.05180379 -47.11309954 -46.84624262 -45.99559708 -45.16174878 -44.25253962 -43.80961217 -43.32862796 -42.90421318 -42.51977044 -41.41638129 3.16867028 3.79557125 5.13494797 6.15577863 6.82828647 7.21801664 7.31150311 7.82216709 8.56222439 9.61800696 10.07332581 10.82198572 11.58689426 11.83812833 13.07342263 13.47914134 13.79924038 14.15748878 14.80880459 15.92888769 16.24301547 16.95879333 17.97629169 18.31838815 21.96927100 22.22937015 28.39211767 28.58909741 29.18199554 29.39709073 31.68650488 32.21255741 35.79893828 35.81979282 35.92295112 35.96563548
+   2 0.82915620 -81.10175627 -80.77324879 -77.62066318 -76.50645981 -48.77161167 -48.39941899 -48.01019247 -46.86510462 -46.57884280 -45.80018866 -44.06569224 -43.75791263 -43.09085467 -43.00922971 -41.90296097 -41.04758175 4.17006871 6.45579334 6.75816005 7.00768474 7.72582662 8.00853728 8.78685699 9.32009401 10.33266285 11.00473180 11.51204593 12.08031313 12.47631514 13.13935058 13.29522017 13.95792499 14.67217865 15.09735944 15.56928179 15.69679392 16.42789318 16.94993037 17.86411673 18.47145732 19.07698954 19.57788780 20.84435638 21.44693718 26.72714588 26.81209594 27.54363188 27.69151703 28.78218337 28.90236528 30.51378938 30.86720034
+   3 2.24336976 -81.81699214 -80.54300518 -78.16305079 -77.20106209 -49.17072128 -48.08020252 -47.60325656 -46.97267382 -45.84042501 -44.61889305 -43.85491252 -43.61774835 -43.16511756 -42.70935290 -42.38341026 -40.76879517 4.95741057 5.88906775 6.65766124 7.56916634 7.83189331 8.57746390 8.96955931 9.78306171 9.96996693 10.35343686 10.97546003 11.51761850 11.98974426 12.88830307 13.65557399 14.24765814 14.60487727 15.36245775 15.58947737 15.95114711 16.30844135 16.64852852 16.95966354 18.11732802 19.14383346 20.13884583 20.70359105 21.88059524 26.76868015 26.79258365 27.54297052 27.77900288 28.70454341 28.92890934 30.42018770 30.69387544
+   4 3.65758332 -81.81458357 -81.14041892 -77.46430087 -76.43986474 -48.54400095 -47.92984324 -47.70697073 -46.88389045 -46.05858194 -45.66647428 -43.99876504 -43.59683565 -43.15413976 -42.74725551 -42.27280545 -41.15469194 4.57328963 5.85587322 6.12395867 6.42651552 7.75146214 8.36714964 9.25467150 9.44556575 10.23255055 10.98354817 11.30357410 12.27001965 12.58833679 12.90746140 13.51369963 14.32839481 14.59049563 15.15343925 15.38851078 16.75044481 17.14681328 17.28677792 17.69380906 18.53898402 19.01591604 19.55929412 20.70329144 20.90676766 26.78639508 26.87324153 27.40984114 27.56127188 28.59165313 28.97716410 30.46267093 30.71056343
diff --git a/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/onsite.dm b/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/onsite.dm
new file mode 100644
index 00000000000..ba9367a6c2d
--- /dev/null
+++ b/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/onsite.dm
@@ -0,0 +1,26 @@
+atoms  0
+L  2
+zeta  0
+  1.07284172  0.01795797  0.01795797  1.29498054  0.01795797  0.25781967  0.00000000  0.03110411  0.03591594 -0.25781967
+  0.00001170  0.00000304  0.00000304  0.00001265  0.00000304  0.00000743 -0.00000000  0.00000526  0.00000608 -0.00000743
+  0.01795797  0.00000000  0.25781967  0.03110411  1.29498054 -0.03110411 -0.03110411  1.07284172 -0.25781967  0.00000000
+  0.00000304 -0.00000000  0.00000743  0.00000526  0.00001265 -0.00000526 -0.00000526  0.00001170 -0.00000743  0.00000000
+  0.03591594  0.80832072 -0.25781967 -0.01014329 -0.25781967 -0.01014329  0.00000000 -0.00000000  1.29498054 -0.02028657
+  0.00000608 -0.00006150 -0.00000743  0.00000679 -0.00000743  0.00000679  0.00000000 -0.00000000  0.00001265  0.00001359
+ -0.01014329 -0.01014329  0.58933746 -0.26117797 -0.26117797  0.58933746 -0.01756869  0.01756869  0.26117797  0.26117797
+  0.00000679  0.00000679 -0.00001181  0.00000258  0.00000258 -0.00001181  0.00001177 -0.00001177 -0.00000258 -0.00000258
+ -0.00000000 -0.02028657 -0.01756869  0.26117797  0.01756869  0.26117797  0.80832072 -0.00000000 -0.00000000  0.58933746
+ -0.00000000  0.00001359  0.00001177 -0.00000258 -0.00001177 -0.00000258 -0.00006150 -0.00000000 -0.00000000 -0.00001181
+atoms  1
+L  2
+zeta  0
+  1.07330401  0.01806557  0.01806557  1.29470827  0.01806557  0.25729798  0.00000000  0.03129048  0.03613113 -0.25729798
+ -0.00000491 -0.00000164 -0.00000164  0.00000736 -0.00000164 -0.00000999  0.00000000 -0.00000284 -0.00000328  0.00000999
+  0.01806557  0.00000000  0.25729798  0.03129048  1.29470827 -0.03129048 -0.03129048  1.07330401 -0.25729798 -0.00000000
+ -0.00000164  0.00000000 -0.00000999 -0.00000284  0.00000736  0.00000284  0.00000284 -0.00000491  0.00000999 -0.00000000
+  0.03613113 -0.80780604 -0.25729798  0.01023653 -0.25729798  0.01023653 -0.00000000  0.00000000  1.29470827  0.02047305
+ -0.00000328 -0.00003287  0.00000999 -0.00000948  0.00000999 -0.00000948 -0.00000000  0.00000000  0.00000736 -0.00001895
+  0.01023653  0.01023653 -0.58946235  0.26074301  0.26074301 -0.58946235  0.01773018 -0.01773018 -0.26074301 -0.26074301
+ -0.00000948 -0.00000948 -0.00003291  0.00002481  0.00002481 -0.00003291 -0.00001641  0.00001641 -0.00002481 -0.00002481
+  0.00000000  0.02047305  0.01773018 -0.26074301 -0.01773018 -0.26074301 -0.80780604 -0.00000000 -0.00000000 -0.58946235
+  0.00000000 -0.00001895 -0.00001641 -0.00002481  0.00001641 -0.00002481 -0.00003287  0.00000000  0.00000000 -0.00003291
diff --git a/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/result.ref b/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/result.ref
new file mode 100644
index 00000000000..91669bb898c
--- /dev/null
+++ b/tests/17_DS_DFTU/62_LCAO_DFTU_NSCF_Band_XY/result.ref
@@ -0,0 +1,4 @@
+etotref -6765.43001420612
+etotperatomref -3382.71500710306
+CompareBand_pass 0
+totaltimeref 1
diff --git a/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/INPUT b/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/INPUT
new file mode 100644
index 00000000000..e08e8f1d337
--- /dev/null
+++ b/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/INPUT
@@ -0,0 +1,38 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    nscf
+basis_type    lcao
+ecutwfc    20
+gamma_only    0
+init_chg    file
+read_file_dir    ./
+noncolin    1
+#nbands    28
+scf_thr    1.0e-6
+scf_nmax    1
+out_chg    0
+out_band    1
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    genelpa
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+# DeltaSpin parameters
+sc_mag_switch    1
+sc_thr    1e-4
+nsc    100
+nsc_min    2
+alpha_trial    0.01
+sccut    3.0
+sc_scf_thr    10
+
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
diff --git a/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/KPT b/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/STRU b/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/STRU
new file mode 100644
index 00000000000..17f53a6dcde
--- /dev/null
+++ b/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0 sc 1 1 1
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0 sc 1 1 1
diff --git a/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/autotest-CHARGE-DENSITY.restart b/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/autotest-CHARGE-DENSITY.restart
new file mode 100644
index 00000000000..2c8f427bfa2
Binary files /dev/null and b/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/autotest-CHARGE-DENSITY.restart differ
diff --git a/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/band.txt.ref b/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/band.txt.ref
new file mode 100644
index 00000000000..c082adc40b4
--- /dev/null
+++ b/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/band.txt.ref
@@ -0,0 +1,4 @@
+   1 0.00000000 -81.35798613 -80.91162162 -77.87936703 -76.74784146 -48.21252989 -48.05180379 -47.11309954 -46.84624262 -45.99559708 -45.16174878 -44.25253962 -43.80961217 -43.32862796 -42.90421319 -42.51977044 -41.41638129 3.16867028 3.79557125 5.13494797 6.15577863 6.82828647 7.21801664 7.31150311 7.82216709 8.56222439 9.61800696 10.07332581 10.82198572 11.58689426 11.83812833 13.07342263 13.47914134 13.79924038 14.15748878 14.80880459 15.92888769 16.24301547 16.95879333 17.97629169 18.31838815 21.96927100 22.22937015 28.39211767 28.58909741 29.18199554 29.39709073 31.68650488 32.21255741 35.79893828 35.81979282 35.92295112 35.96563548
+   2 0.82915620 -81.10175627 -80.77324879 -77.62066318 -76.50645981 -48.77161167 -48.39941899 -48.01019247 -46.86510462 -46.57884280 -45.80018866 -44.06569224 -43.75791263 -43.09085467 -43.00922971 -41.90296097 -41.04758175 4.17006871 6.45579334 6.75816005 7.00768474 7.72582662 8.00853728 8.78685699 9.32009401 10.33266285 11.00473180 11.51204593 12.08031313 12.47631514 13.13935058 13.29522017 13.95792499 14.67217865 15.09735944 15.56928179 15.69679392 16.42789318 16.94993037 17.86411673 18.47145732 19.07698954 19.57788780 20.84435638 21.44693718 26.72714588 26.81209594 27.54363188 27.69151703 28.78218337 28.90236528 30.51378938 30.86720034
+   3 2.24336976 -81.81699214 -80.54300518 -78.16305079 -77.20106209 -49.17072128 -48.08020252 -47.60325656 -46.97267382 -45.84042501 -44.61889305 -43.85491252 -43.61774835 -43.16511756 -42.70935290 -42.38341026 -40.76879517 4.95741057 5.88906775 6.65766124 7.56916634 7.83189331 8.57746390 8.96955931 9.78306171 9.96996693 10.35343686 10.97546003 11.51761850 11.98974426 12.88830307 13.65557399 14.24765814 14.60487727 15.36245775 15.58947737 15.95114711 16.30844135 16.64852852 16.95966354 18.11732802 19.14383346 20.13884583 20.70359105 21.88059524 26.76868015 26.79258365 27.54297052 27.77900288 28.70454341 28.92890934 30.42018770 30.69387544
+   4 3.65758332 -81.81458357 -81.14041892 -77.46430087 -76.43986474 -48.54400095 -47.92984324 -47.70697073 -46.88389045 -46.05858194 -45.66647428 -43.99876504 -43.59683565 -43.15413976 -42.74725551 -42.27280545 -41.15469194 4.57328963 5.85587322 6.12395867 6.42651552 7.75146214 8.36714964 9.25467150 9.44556575 10.23255055 10.98354817 11.30357410 12.27001965 12.58833679 12.90746140 13.51369963 14.32839481 14.59049563 15.15343925 15.38851078 16.75044481 17.14681328 17.28677792 17.69380906 18.53898402 19.01591604 19.55929412 20.70329144 20.90676766 26.78639508 26.87324153 27.40984114 27.56127188 28.59165313 28.97716410 30.46267093 30.71056343
diff --git a/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/onsite.dm b/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/onsite.dm
new file mode 100644
index 00000000000..ba9367a6c2d
--- /dev/null
+++ b/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/onsite.dm
@@ -0,0 +1,26 @@
+atoms  0
+L  2
+zeta  0
+  1.07284172  0.01795797  0.01795797  1.29498054  0.01795797  0.25781967  0.00000000  0.03110411  0.03591594 -0.25781967
+  0.00001170  0.00000304  0.00000304  0.00001265  0.00000304  0.00000743 -0.00000000  0.00000526  0.00000608 -0.00000743
+  0.01795797  0.00000000  0.25781967  0.03110411  1.29498054 -0.03110411 -0.03110411  1.07284172 -0.25781967  0.00000000
+  0.00000304 -0.00000000  0.00000743  0.00000526  0.00001265 -0.00000526 -0.00000526  0.00001170 -0.00000743  0.00000000
+  0.03591594  0.80832072 -0.25781967 -0.01014329 -0.25781967 -0.01014329  0.00000000 -0.00000000  1.29498054 -0.02028657
+  0.00000608 -0.00006150 -0.00000743  0.00000679 -0.00000743  0.00000679  0.00000000 -0.00000000  0.00001265  0.00001359
+ -0.01014329 -0.01014329  0.58933746 -0.26117797 -0.26117797  0.58933746 -0.01756869  0.01756869  0.26117797  0.26117797
+  0.00000679  0.00000679 -0.00001181  0.00000258  0.00000258 -0.00001181  0.00001177 -0.00001177 -0.00000258 -0.00000258
+ -0.00000000 -0.02028657 -0.01756869  0.26117797  0.01756869  0.26117797  0.80832072 -0.00000000 -0.00000000  0.58933746
+ -0.00000000  0.00001359  0.00001177 -0.00000258 -0.00001177 -0.00000258 -0.00006150 -0.00000000 -0.00000000 -0.00001181
+atoms  1
+L  2
+zeta  0
+  1.07330401  0.01806557  0.01806557  1.29470827  0.01806557  0.25729798  0.00000000  0.03129048  0.03613113 -0.25729798
+ -0.00000491 -0.00000164 -0.00000164  0.00000736 -0.00000164 -0.00000999  0.00000000 -0.00000284 -0.00000328  0.00000999
+  0.01806557  0.00000000  0.25729798  0.03129048  1.29470827 -0.03129048 -0.03129048  1.07330401 -0.25729798 -0.00000000
+ -0.00000164  0.00000000 -0.00000999 -0.00000284  0.00000736  0.00000284  0.00000284 -0.00000491  0.00000999 -0.00000000
+  0.03613113 -0.80780604 -0.25729798  0.01023653 -0.25729798  0.01023653 -0.00000000  0.00000000  1.29470827  0.02047305
+ -0.00000328 -0.00003287  0.00000999 -0.00000948  0.00000999 -0.00000948 -0.00000000  0.00000000  0.00000736 -0.00001895
+  0.01023653  0.01023653 -0.58946235  0.26074301  0.26074301 -0.58946235  0.01773018 -0.01773018 -0.26074301 -0.26074301
+ -0.00000948 -0.00000948 -0.00003291  0.00002481  0.00002481 -0.00003291 -0.00001641  0.00001641 -0.00002481 -0.00002481
+  0.00000000  0.02047305  0.01773018 -0.26074301 -0.01773018 -0.26074301 -0.80780604 -0.00000000 -0.00000000 -0.58946235
+  0.00000000 -0.00001895 -0.00001641 -0.00002481  0.00001641 -0.00002481 -0.00003287  0.00000000  0.00000000 -0.00003291
diff --git a/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/result.ref b/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/result.ref
new file mode 100644
index 00000000000..71ab30dfc69
--- /dev/null
+++ b/tests/17_DS_DFTU/63_LCAO_DFTU_DS_NSCF_Band_XY/result.ref
@@ -0,0 +1,4 @@
+etotref -6765.43001420620
+etotperatomref -3382.71500710310
+CompareBand_pass 0
+totaltimeref 2
diff --git a/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/INPUT b/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/INPUT
new file mode 100644
index 00000000000..8e4de71a026
--- /dev/null
+++ b/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/INPUT
@@ -0,0 +1,31 @@
+INPUT_PARAMETERS
+suffix    autotest
+calculation    nscf
+basis_type    pw
+ecutwfc    20
+gamma_only    0
+init_chg    file
+read_file_dir    ./
+noncolin    1
+#nbands    40
+scf_thr    1.0e-6
+scf_nmax    1
+out_chg    0
+out_band    1
+smearing_method    gaussian
+smearing_sigma    0.01
+mixing_type    broyden
+mixing_beta    0.4
+ks_solver    dav_subspace
+symmetry    0
+
+# DFT+U parameters
+dft_plus_u    1
+orbital_corr    2
+hubbard_u    5.0
+onsite_radius   3.0
+
+kpar    1
+pseudo_dir    ../../PP_ORB
+orbital_dir    ../../PP_ORB
+pw_seed 1
diff --git a/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/KPT b/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/KPT
new file mode 100644
index 00000000000..35597cecff1
--- /dev/null
+++ b/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Monkhorst-Pack
+2 2 2 0 0 0
diff --git a/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/STRU b/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/STRU
new file mode 100644
index 00000000000..63c4d14399c
--- /dev/null
+++ b/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/STRU
@@ -0,0 +1,21 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00   0.00   0.00   magmom  2.0  0.0  0.0
+0.51   0.51   0.51   magmom  -2.0  0.0  0.0
diff --git a/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/autotest-CHARGE-DENSITY.restart b/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/autotest-CHARGE-DENSITY.restart
new file mode 100644
index 00000000000..dbfc6545c61
Binary files /dev/null and b/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/autotest-CHARGE-DENSITY.restart differ
diff --git a/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/band.txt.ref b/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/band.txt.ref
new file mode 100644
index 00000000000..1ee8a962066
--- /dev/null
+++ b/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/band.txt.ref
@@ -0,0 +1,4 @@
+   1 0.00000000 -118.74659335 -118.74105316 -117.71243266 -117.71010697 -60.43942922 -60.43198848 -59.24890926 -59.24855148 -59.24621462 -59.24613546 -58.51271748 -58.50555152 -56.29487791 -56.29481572 -56.29287034 -56.29258877 2.64311620 2.64973398 5.67083421 5.67480330 9.99068874 10.00294181 10.18234765 10.18239223 10.18678632 10.18683943 10.24485869 10.24491226 10.24950719 10.24957186 11.34253572 11.35559667 13.07230230 13.07234185 13.07368595 13.07377607 13.58972452 13.58976572 13.59991270 13.60001804 15.99826102 16.01361492 24.70650406 24.71753854 26.78032503 26.78045088 26.78898059 26.78905531 27.27406290 27.27939518 28.35680183 28.36332491
+   2 0.82915620 -118.80368460 -118.79809980 -117.78944153 -117.78708036 -62.05046219 -62.04806635 -60.99915730 -60.99736002 -60.04357408 -60.04053523 -59.52262037 -59.51990186 -58.70518485 -58.70103313 -58.67371124 -58.67288114 6.45311232 6.45926062 7.48320213 7.49000622 8.79761027 8.80043858 9.56020356 9.56551933 9.72542883 9.73498425 9.95285811 9.95607541 10.54676401 10.55014375 10.86623027 10.87447067 11.30366212 11.31071686 12.83052430 12.83751385 12.90110713 12.90426321 13.11638182 13.12544514 13.56207446 13.56742663 14.90063350 14.90600675 19.75546671 19.76418912 20.88342278 20.89209525 23.98781051 23.99591683 24.93117047 24.93542760
+   3 2.24336976 -118.80368557 -118.79809951 -117.78944186 -117.78707950 -62.05008134 -62.04820681 -60.99956034 -60.99721634 -60.04352720 -60.04067929 -59.52259954 -59.52009612 -58.70517255 -58.70103915 -58.67376328 -58.67253821 6.45284517 6.45938177 7.48308029 7.49012123 8.79754168 8.80064706 9.56020632 9.56547628 9.72526202 9.73499741 9.95290081 9.95603601 10.54660460 10.55060293 10.86652043 10.87441346 11.30386626 11.31086260 12.83094329 12.83650890 12.90109634 12.90430775 13.11643926 13.12540010 13.56232485 13.56698530 14.90069606 14.90592441 19.75546654 19.76418322 20.88340848 20.89209970 23.98781915 23.99591767 24.93114743 24.93544303
+   4 3.65758332 -118.80368458 -118.79809967 -117.78944166 -117.78708048 -62.05040961 -62.04811733 -60.99922575 -60.99730699 -60.04353542 -60.04055074 -59.52257959 -59.51993402 -58.70519462 -58.70104660 -58.67372710 -58.67285963 6.45311497 6.45926327 7.48315175 7.49007185 8.79740942 8.80050481 9.56022290 9.56550773 9.72549108 9.73513556 9.95287148 9.95604062 10.54670755 10.55025149 10.86628870 10.87433312 11.30364117 11.31061114 12.83069521 12.83750463 12.90111654 12.90429728 13.11641208 13.12538266 13.56209366 13.56729875 14.90066332 14.90597275 19.75549522 19.76417346 20.88344461 20.89207078 23.98776890 23.99594464 24.93117319 24.93542294
diff --git a/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/onsite.dm b/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/onsite.dm
new file mode 100644
index 00000000000..ba9367a6c2d
--- /dev/null
+++ b/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/onsite.dm
@@ -0,0 +1,26 @@
+atoms  0
+L  2
+zeta  0
+  1.07284172  0.01795797  0.01795797  1.29498054  0.01795797  0.25781967  0.00000000  0.03110411  0.03591594 -0.25781967
+  0.00001170  0.00000304  0.00000304  0.00001265  0.00000304  0.00000743 -0.00000000  0.00000526  0.00000608 -0.00000743
+  0.01795797  0.00000000  0.25781967  0.03110411  1.29498054 -0.03110411 -0.03110411  1.07284172 -0.25781967  0.00000000
+  0.00000304 -0.00000000  0.00000743  0.00000526  0.00001265 -0.00000526 -0.00000526  0.00001170 -0.00000743  0.00000000
+  0.03591594  0.80832072 -0.25781967 -0.01014329 -0.25781967 -0.01014329  0.00000000 -0.00000000  1.29498054 -0.02028657
+  0.00000608 -0.00006150 -0.00000743  0.00000679 -0.00000743  0.00000679  0.00000000 -0.00000000  0.00001265  0.00001359
+ -0.01014329 -0.01014329  0.58933746 -0.26117797 -0.26117797  0.58933746 -0.01756869  0.01756869  0.26117797  0.26117797
+  0.00000679  0.00000679 -0.00001181  0.00000258  0.00000258 -0.00001181  0.00001177 -0.00001177 -0.00000258 -0.00000258
+ -0.00000000 -0.02028657 -0.01756869  0.26117797  0.01756869  0.26117797  0.80832072 -0.00000000 -0.00000000  0.58933746
+ -0.00000000  0.00001359  0.00001177 -0.00000258 -0.00001177 -0.00000258 -0.00006150 -0.00000000 -0.00000000 -0.00001181
+atoms  1
+L  2
+zeta  0
+  1.07330401  0.01806557  0.01806557  1.29470827  0.01806557  0.25729798  0.00000000  0.03129048  0.03613113 -0.25729798
+ -0.00000491 -0.00000164 -0.00000164  0.00000736 -0.00000164 -0.00000999  0.00000000 -0.00000284 -0.00000328  0.00000999
+  0.01806557  0.00000000  0.25729798  0.03129048  1.29470827 -0.03129048 -0.03129048  1.07330401 -0.25729798 -0.00000000
+ -0.00000164  0.00000000 -0.00000999 -0.00000284  0.00000736  0.00000284  0.00000284 -0.00000491  0.00000999 -0.00000000
+  0.03613113 -0.80780604 -0.25729798  0.01023653 -0.25729798  0.01023653 -0.00000000  0.00000000  1.29470827  0.02047305
+ -0.00000328 -0.00003287  0.00000999 -0.00000948  0.00000999 -0.00000948 -0.00000000  0.00000000  0.00000736 -0.00001895
+  0.01023653  0.01023653 -0.58946235  0.26074301  0.26074301 -0.58946235  0.01773018 -0.01773018 -0.26074301 -0.26074301
+ -0.00000948 -0.00000948 -0.00003291  0.00002481  0.00002481 -0.00003291 -0.00001641  0.00001641 -0.00002481 -0.00002481
+  0.00000000  0.02047305  0.01773018 -0.26074301 -0.01773018 -0.26074301 -0.80780604 -0.00000000 -0.00000000 -0.58946235
+  0.00000000 -0.00001895 -0.00001641 -0.00002481  0.00001641 -0.00002481 -0.00003287  0.00000000  0.00000000 -0.00003291
diff --git a/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/result.ref b/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/result.ref
new file mode 100644
index 00000000000..ad9ce9eeb7a
--- /dev/null
+++ b/tests/17_DS_DFTU/64_PW_DFTU_NSCF_Band_XY/result.ref
@@ -0,0 +1,4 @@
+etotref -6368.988302
+etotperatomref -3184.494151
+CompareBand_pass 0
+totaltimeref 2
diff --git a/tests/17_DS_DFTU/CASES_CPU.txt b/tests/17_DS_DFTU/CASES_CPU.txt
new file mode 100644
index 00000000000..51672d33743
--- /dev/null
+++ b/tests/17_DS_DFTU/CASES_CPU.txt
@@ -0,0 +1,60 @@
+# LCAO baseline tests
+01_LCAO_SPIN_S2_Z
+# 02_LCAO_SPIN_S4_XYZ
+03_LCAO_DFTU_S2_Z
+# 04_LCAO_DFTU_S4_XY
+# 05_LCAO_DFTU_S4_XYZ
+# PW baseline tests
+06_PW_SPIN_S2_Z
+07_PW_SPIN_S4_XYZ
+08_PW_DFTU_S2_Z
+09_PW_DFTU_S4_XY
+11_PW_DFTU_S2_FeO
+# PW pure DeltaSpin
+12_PW_DS_S2_Z
+14_PW_DS_S4_XYZ
+15_PW_DS_S4_Z
+16_PW_DS_S4_XY
+# PW DFT+U + DeltaSpin
+18_PW_DFTU_DS_S2_Z
+19_PW_DFTU_DS_S4_XY
+21_PW_DFTU_DS_S4_Z
+# LCAO pure DeltaSpin
+# 24_LCAO_DS_S2_Z
+# 26_LCAO_DS_S4_XYZ
+# 27_LCAO_DS_S4_Z
+# 28_LCAO_DS_S4_XY
+# LCAO DFT+U + DeltaSpin
+# 30_LCAO_DFTU_DS_S2_Z
+# 31_LCAO_DFTU_DS_S4_XY
+# 32_LCAO_DFTU_DS_S4_XYZ
+# 33_LCAO_DFTU_DS_S4_Z
+# ReadLam mode (nsc=1, load pre-computed lambda)
+36_PW_DS_S2_ReadLam_Z
+37_PW_DS_S4_ReadLam_XY
+# Lambda loop behavior: thr1e-10 = no lambda loop, load from STRU
+38_PW_DS_S2_Thr1e10_Z
+39_PW_DS_S4_Thr1e10_XY
+42_PW_DFTU_DS_S2_Thr1e10_Z
+43_PW_DFTU_DS_S4_Thr1e10_XY
+# Lambda loop behavior: thr10 = immediate lambda loop
+40_PW_DS_S2_Thr10_Z
+41_PW_DS_S4_Thr10_XY
+# 44_PW_DFTU_DS_S2_Thr10_Z
+45_PW_DFTU_DS_S4_Thr10_XY
+# FeO multi-element DFT+U
+50_FeO_O_first_Fe_second
+51_FeO_Fe_first_O_second
+# NSCF (non-self-consistent) tests
+# Note: DFT+U NSCF requires both charge density and onsite.dm files from prior SCF
+55_PW_DS_NSCF_S4_XY
+60_PW_DFTU_DS_NSCF_Band_XY
+61_LCAO_DS_NSCF_S4_XY
+# 62_LCAO_DFTU_NSCF_Band_XY
+# 63_LCAO_DFTU_DS_NSCF_Band_XY
+64_PW_DFTU_NSCF_Band_XY
+# DirectionOnly mode (constrain direction, not magnitude)
+56_PW_DS_S4_DirectionOnly_XY
+57_PW_DFTU_DS_S4_DirectionOnly_XY
+# 58_LCAO_DS_S4_DirectionOnly_XY
+# 59_LCAO_DFTU_DS_S4_DirectionOnly_XY
diff --git a/tests/17_DS_DFTU/CMakeLists.txt b/tests/17_DS_DFTU/CMakeLists.txt
new file mode 100644
index 00000000000..7c78260e772
--- /dev/null
+++ b/tests/17_DS_DFTU/CMakeLists.txt
@@ -0,0 +1,16 @@
+enable_testing()
+
+find_program(BASH bash)
+if(ENABLE_ASAN)
+    add_test(
+        NAME 17_DS_DFTU_test_with_asan
+        COMMAND ${BASH} ../integrate/Autotest.sh -a ${ABACUS_BIN_PATH} -n 2 -s true
+        WORKING_DIRECTORY ${ABACUS_TEST_DIR}/17_DS_DFTU
+    )
+else()
+    add_test(
+        NAME 17_DS_DFTU
+        COMMAND ${BASH} ../integrate/Autotest.sh -a ${ABACUS_BIN_PATH} -n 4
+        WORKING_DIRECTORY ${ABACUS_TEST_DIR}/17_DS_DFTU
+    )
+endif()
diff --git a/tests/17_DS_DFTU/README.md b/tests/17_DS_DFTU/README.md
new file mode 100644
index 00000000000..4a76a859865
--- /dev/null
+++ b/tests/17_DS_DFTU/README.md
@@ -0,0 +1,161 @@
+# 17_DS_DFTU — DeltaSpin & DFT+U Integration Test Suite
+
+This directory contains integration test cases for **DeltaSpin (spin-constrained DFT)** and **DFT+U** functionality in ABACUS,
+covering LCAO and PW basis sets, collinear/noncollinear spin, DFT+U, DeltaSpin, and their combinations.
+
+## Test List (47 cases)
+
+### I. LCAO Spin (01-02)
+
+| # | Test Case | Description |
+|---|------|------|
+| 01 | LCAO_SPIN_S2_Z | Verify basic SCF convergence of collinear spin with LCAO basis, serves as baseline for LCAO magnetic calculations |
+| 02 | LCAO_SPIN_S4_XYZ | Verify basic SCF convergence of noncollinear spin with LCAO basis, covers LCAO noncollinear calculation path |
+
+### II. LCAO DFT+U (03-05)
+
+| # | Test Case | Description |
+|---|------|------|
+| 03 | LCAO_DFTU_S2_Z | Verify coupling of DFT+U (U=5.0eV, l=2) with collinear spin in LCAO basis, ensures correct DFT+U occupation matrix calculation in LCAO path |
+| 04 | LCAO_DFTU_S4_XY | Verify coupling of DFT+U with noncollinear spin (XY magnetization) in LCAO basis, covers nspin=4 occupation matrix calculation in LCAO path |
+| 05 | LCAO_DFTU_S4_XYZ | Verify coupling of DFT+U with noncollinear spin (XYZ magnetization) in LCAO basis, covers the most complete occupation matrix scenario in LCAO path |
+
+### III. PW Spin (06-07)
+
+| # | Test Case | Description |
+|---|------|------|
+| 06 | PW_SPIN_S2_Z | Verify basic SCF convergence of collinear spin with PW basis, serves as baseline for PW magnetic calculations |
+| 07 | PW_SPIN_S4_XYZ | Verify basic SCF convergence of noncollinear spin with PW basis, covers PW noncollinear calculation path |
+
+### IV. PW DFT+U (08-09, 11)
+
+| # | Test Case | Description |
+|---|------|------|
+| 08 | PW_DFTU_S2_Z | Verify coupling of DFT+U (U=5.0eV, l=2) with collinear spin in PW basis, ensures correct DFT+U effective potential calculation in PW path |
+| 09 | PW_DFTU_S4_XY | Verify coupling of DFT+U with noncollinear spin (XY magnetization) in PW basis, covers onsite projection matrix for nspin=4 in PW path |
+| 11 | PW_DFTU_S2_FeO | Verify correctness of DFT+U on FeO system with PW basis, ensures DFT+U correction for Fe-3d orbitals is effective |
+
+### V. PW DeltaSpin (12, 14-16)
+
+| # | Test Case | Description |
+|---|------|------|
+| 12 | PW_DS_S2_Z | Verify coupling of DeltaSpin with collinear spin in PW basis, ensures correct DeltaSpin iterative optimization of magnetization to target values |
+| 14 | PW_DS_S4_XYZ | Verify iterative optimization of noncollinear DeltaSpin under XYZ three-direction magnetization constraint, covers the most complete spin constraint scenario |
+| 15 | PW_DS_S4_Z | Verify behavior of noncollinear DeltaSpin when constraining only Z-direction magnetization, ensures uniaxial constraint does not introduce unphysical XY components in noncolin=1 framework |
+| 16 | PW_DS_S4_XY | Verify iterative optimization of noncollinear DeltaSpin under XY magnetization constraint with a different crystal structure, verifies generalization of noncollinear DeltaSpin XY constraint under different lattices |
+
+### VI. PW DFT+U + DeltaSpin (18-19, 21)
+
+| # | Test Case | Description |
+|---|------|------|
+| 18 | PW_DFTU_DS_S2_Z | Verify coupling of DFT+U with DeltaSpin combined (collinear spin) in PW basis, ensures U correction and magnetization constraint do not conflict |
+| 19 | PW_DFTU_DS_S4_XY | Verify coupling of noncollinear DFT+U+DeltaSpin combined under XY magnetization constraint, covers joint iteration of both methods in nspin=4 path |
+| 21 | PW_DFTU_DS_S4_Z | Verify behavior of noncollinear DFT+U+DeltaSpin combined when constraining only Z-direction magnetization, ensures correct superposition of uniaxial constraint with DFT+U effective potential |
+
+### VII. LCAO DeltaSpin (24, 26-28)
+
+| # | Test Case | Description |
+|---|------|------|
+| 24 | LCAO_DS_S2_Z | Verify coupling of DeltaSpin with collinear spin in LCAO basis, ensures correct spin constraint optimization in LCAO density matrix path |
+| 26 | LCAO_DS_S4_XYZ | Verify iterative optimization of noncollinear DeltaSpin under XYZ three-direction magnetization constraint in LCAO basis, covers the most complete constraint scenario in LCAO path |
+| 27 | LCAO_DS_S4_Z | Verify behavior of noncollinear DeltaSpin when constraining only Z-direction magnetization in LCAO basis, ensures correctness of uniaxial constraint in noncolin=1 framework |
+| 28 | LCAO_DS_S4_XY | Verify iterative optimization of noncollinear DeltaSpin under XY magnetization constraint in LCAO basis with a different crystal structure, verifies generalization of LCAO noncollinear DeltaSpin XY constraint under different lattices |
+
+### VIII. LCAO DFT+U + DeltaSpin (30-33)
+
+| # | Test Case | Description |
+|---|------|------|
+| 30 | LCAO_DFTU_DS_S2_Z | Verify coupling of DFT+U with DeltaSpin combined (collinear spin) in LCAO basis, ensures U correction and magnetization constraint do not conflict in density matrix path |
+| 31 | LCAO_DFTU_DS_S4_XY | Verify coupling of noncollinear DFT+U+DeltaSpin combined under XY magnetization constraint in LCAO basis, covers joint constraint in LCAO density matrix path |
+| 32 | LCAO_DFTU_DS_S4_XYZ | Verify coupling of noncollinear DFT+U+DeltaSpin combined under XYZ three-direction magnetization constraint in LCAO basis, covers the most complete joint scenario in LCAO path |
+| 33 | LCAO_DFTU_DS_S4_Z | Verify behavior of noncollinear DFT+U+DeltaSpin combined when constraining only Z-direction magnetization in LCAO basis, ensures correct superposition of uniaxial constraint with DFT+U density matrix |
+
+### IX. PW DeltaSpin Special Parameters (36-41)
+
+| # | Test Case | Description |
+|---|------|------|
+| 36 | PW_DS_S2_ReadLam_Z | Verify correctness of `nsc=1` mode (read lambda file directly without iterative optimization), ensures DeltaSpin correctly computes magnetization in non-self-consistent lambda mode |
+| 37 | PW_DS_S4_ReadLam_XY | Verify `nsc=1` mode for noncollinear DeltaSpin, covers non-self-consistent lambda path under XY magnetization constraint |
+| 38 | PW_DS_S2_Thr1e10_Z | Verify stability of DeltaSpin under strict convergence threshold (sc_scf_thr=1e-10), ensures iterative optimization converges to high-precision solution |
+| 39 | PW_DS_S4_Thr1e10_XY | Verify stability of noncollinear DeltaSpin under strict convergence threshold (sc_scf_thr=1e-10), covers XY magnetization constraint scenario |
+| 40 | PW_DS_S2_Thr10_Z | Verify behavior of DeltaSpin under loose convergence threshold (sc_scf_thr=10), tests algorithm robustness and out_alllog log output under low precision requirements |
+| 41 | PW_DS_S4_Thr10_XY | Verify behavior of noncollinear DeltaSpin under loose convergence threshold (sc_scf_thr=10), covers low precision scenario with XY magnetization constraint |
+
+### X. PW DFT+U + DeltaSpin Special Parameters (42-45)
+
+| # | Test Case | Description |
+|---|------|------|
+| 42 | PW_DFTU_DS_S2_Thr1e10_Z | Verify iterative stability of DFT+U with DeltaSpin combined under strict convergence threshold (sc_scf_thr=1e-10), ensures convergence when both methods are coupled |
+| 43 | PW_DFTU_DS_S4_Thr1e10_XY | Verify coupling stability of noncollinear DFT+U+DeltaSpin under strict convergence threshold (sc_scf_thr=1e-10), covers XY magnetization constraint |
+| 44 | PW_DFTU_DS_S2_Thr10_Z | Verify behavior of DFT+U with DeltaSpin combined under loose convergence threshold (sc_scf_thr=10), tests coupled algorithm robustness under low precision requirements |
+| 45 | PW_DFTU_DS_S4_Thr10_XY | Verify behavior of noncollinear DFT+U+DeltaSpin under loose convergence threshold (sc_scf_thr=10), covers low precision scenario with XY magnetization constraint |
+
+### XI. FeO Atom Ordering (50-51)
+
+| # | Test Case | Description |
+|---|------|------|
+| 50 | FeO_O_first_Fe_second | Verify correctness of DFT+U in FeO system with O atom type first and Fe second, ensures atom type ordering does not affect DFT+U onsite projection |
+| 51 | FeO_Fe_first_O_second | Verify correctness of DFT+U in FeO system with Fe atom type first and O second, compare with 50 to ensure eff_pot_pw_index indexing is independent of atom type ordering |
+
+### XII. NSCF Mode (55, 60-64)
+
+| # | Test Case | Description |
+|---|------|------|
+| 55 | PW_DS_NSCF_S4_XY | Verify DeltaSpin functionality in non-self-consistent (nscf) calculation mode, ensures lambda constraint is applied correctly without charge update |
+| 60 | PW_DFTU_DS_NSCF_Band_XY | Verify DFT+U+DeltaSpin in NSCF band structure calculation, tests band output with spin constraints on high-symmetry k-point path |
+| 61 | LCAO_DS_NSCF_S4_XY | Verify LCAO DeltaSpin functionality in nscf calculation mode |
+| 62 | LCAO_DFTU_NSCF_Band_XY | Verify LCAO DFT+U (without DeltaSpin) in NSCF band structure calculation; note: runs as `calculation = scf` with `scf_nmax = 1` using pre-converged charge density |
+| 63 | LCAO_DFTU_DS_NSCF_Band_XY | Verify LCAO DFT+U+DeltaSpin in NSCF band structure calculation, tests band output with spin constraints |
+| 64 | PW_DFTU_NSCF_Band_XY | Verify DFT+U (without DeltaSpin) in NSCF band structure calculation, tests band output with Hubbard U correction |
+
+### XIII. sc_direction_only Constraint (56-59)
+
+| # | Test Case | Description |
+|---|------|------|
+| 56 | PW_DS_S4_DirectionOnly_XY | Verify `sc_direction_only=1` mode: only magnetization direction is constrained while magnitude is free to relax, projects lambda perpendicular to target direction |
+| 57 | PW_DFTU_DS_S4_DirectionOnly_XY | Verify `sc_direction_only=1` combined with DFT+U, tests direction-only constraint superposition with Hubbard U correction |
+| 58 | LCAO_DS_S4_DirectionOnly_XY | Verify `sc_direction_only=1` in LCAO basis, ensures direction-only constraint works correctly in LCAO density matrix path |
+| 59 | LCAO_DFTU_DS_S4_DirectionOnly_XY | Verify `sc_direction_only=1` combined with DFT+U in LCAO basis, tests full direction-only constraint in LCAO path |
+
+## Running Tests
+
+```bash
+# Run all tests
+cd tests/17_DS_DFTU
+bash ../integrate/Autotest.sh -a <abacus_path> -n 4
+
+# Run a single test
+cd 08_PW_DFTU_S2_Z
+bash ../../integrate/run_debug.sh ""
+```
+
+## CI-Disabled Tests
+
+The following test cases are disabled in `CASES_CPU.txt` (commented out with `#`) and excluded from CI testing due to **convergence and numerical stability issues**. They can be manually unskipped for local testing by removing the `#` prefix.
+
+| # | Test Case | Reason |
+|---|------|--------|
+| 02 | LCAO_SPIN_S4_XYZ | Convergence / numerical stability |
+| 04 | LCAO_DFTU_S4_XY | Convergence / numerical stability |
+| 05 | LCAO_DFTU_S4_XYZ | Convergence / numerical stability |
+| 24 | LCAO_DS_S2_Z | Convergence / numerical stability |
+| 26 | LCAO_DS_S4_XYZ | Convergence / numerical stability |
+| 27 | LCAO_DS_S4_Z | Convergence / numerical stability |
+| 28 | LCAO_DS_S4_XY | Convergence / numerical stability |
+| 30 | LCAO_DFTU_DS_S2_Z | Convergence / numerical stability |
+| 31 | LCAO_DFTU_DS_S4_XY | Convergence / numerical stability |
+| 32 | LCAO_DFTU_DS_S4_XYZ | Convergence / numerical stability |
+| 33 | LCAO_DFTU_DS_S4_Z | Convergence / numerical stability |
+| 44 | PW_DFTU_DS_S2_Thr10_Z | Convergence / numerical stability |
+| 58 | LCAO_DS_S4_DirectionOnly_XY | Convergence / numerical stability |
+| 59 | LCAO_DFTU_DS_S4_DirectionOnly_XY | Convergence / numerical stability |
+| 62 | LCAO_DFTU_NSCF_Band_XY | Convergence / numerical stability; genelpa eigenvalue inconsistency across thread counts (scalapack_gvx consistent) |
+| 63 | LCAO_DFTU_DS_NSCF_Band_XY | Convergence / numerical stability |
+
+## Test Condition Notes
+
+- 09 (PW DFT+U + noncollinear): Only supports **2-process MPI** execution, `result.ref` reference files provided
+- The following test cases set `kpar=2` in INPUT and require at least **2 MPI processes** to run: 11, 12, 14, 15, 16, 18, 19, 21, 37, 39, 41, 43, 45
+- 62 (LCAO_DFTU_NSCF_Band_XY): Single-thread and multi-thread results are inconsistent; investigation shows HR, HK, and SK are consistent across threads, but eigenvalues from genelpa differ; switching to scalapack_gvx produces consistent results across thread counts. Note: this test is named "NSCF" but actually runs with `calculation = scf` (`scf_nmax = 1`), using pre-shipped charge density and onsite.dm files as initial guess
+- All NSCF tests (55, 60, 61, 63, 64) and test 62 ship pre-converged `autotest-CHARGE-DENSITY.restart` files; DFT+U NSCF tests (60, 63, 64) and test 62 additionally ship pre-converged `onsite.dm` files. These files are self-contained in each test directory — no runtime dependency on other tests
+- All LCAO basis tests use `ks_solver = genelpa`. The genelpa eigenvalue inconsistency across thread counts observed in test 62 may potentially affect other LCAO tests as well
\ No newline at end of file
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 83f1f326297..17d69cfd360 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -5,6 +5,7 @@ add_subdirectory(03_NAO_multik)
 add_subdirectory(04_FF)
 add_subdirectory(05_rtTDDFT)
 add_subdirectory(06_SDFT)
+add_subdirectory(17_DS_DFTU)
 add_subdirectory(07_OFDFT)
 add_subdirectory(08_EXX)
 add_subdirectory(10_others)
diff --git a/tests/PP_ORB/O.upf b/tests/PP_ORB/O.upf
new file mode 100644
index 00000000000..7e7db6d66f6
--- /dev/null
+++ b/tests/PP_ORB/O.upf
@@ -0,0 +1,1224 @@
+<UPF version="2.0.1">
+  <PP_INFO>
+
+ This pseudopotential file has been produced using the code
+ ONCVPSP  (Optimized Norm-Conservinng Vanderbilt PSeudopotential)
+ scalar-relativistic version 2.1.1, 03/26/2014 by D. R. Hamann
+ The code is available through a link at URL www.mat-simresearch.com.
+ Documentation with the package provides a full discription of the
+ input data below.
+
+
+ While it is not required under the terms of the GNU GPL, it is
+ suggested that you cite D. R. Hamann, Phys. Rev. B 88, 085117 (2013)
+ in any publication using these pseudopotentials.
+
+
+ Copyright 2015 The Regents of the University of California
+ 
+ This work is licensed under the Creative Commons Attribution-ShareAlike 
+ 4.0 International License. To view a copy of this license, visit 
+ http://creativecommons.org/licenses/by-sa/4.0/ or send a letter to 
+ Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
+ 
+ This pseudopotential is part of the Schlipf-Gygi norm-conserving 
+ pseudopotential library. Its construction parameters were tuned to 
+ reproduce materials of a training set with very high accuracy and 
+ should be suitable as a general purpose pseudopotential to treat a 
+ variety of different compounds. For details of the construction and 
+ testing of the pseudopotential please refer to:
+ 
+ [insert reference to paper here]
+ 
+ We kindly ask that you include this reference in all publications 
+ associated to this pseudopotential.
+
+
+    <PP_INPUTFILE>
+# ATOM AND REFERENCE CONFIGURATION
+# atsym  z    nc    nv    iexc   psfile
+  O  8.00     1     2     4      upf
+#
+#   n    l    f        energy (Ha)
+    1    0    2.00
+    2    0    2.00
+    2    1    4.00
+#
+# PSEUDOPOTENTIAL AND OPTIMIZATION
+# lmax
+    1
+#
+#   l,   rc,     ep,   ncon, nbas, qcut
+    0   1.29195  -0.88057    5    8   8.98916
+    1   1.47310  -0.33187    5    8   9.14990
+#
+# LOCAL POTENTIAL
+# lloc, lpopt,  rc(5),   dvloc0
+    4    5   0.90330      0.00000
+#
+# VANDERBILT-KLEINMAN-BYLANDER PROJECTORs
+# l, nproj, debl
+    0    2   1.51851
+    1    2   1.53631
+#
+# MODEL CORE CHARGE
+# icmod, fcfact
+    0   0.00000
+#
+# LOG DERIVATIVE ANALYSIS
+# epsh1, epsh2, depsh
+   -5.00    3.00    0.02
+#
+# OUTPUT GRID
+# rlmax, drl
+    6.00    0.01
+#
+# TEST CONFIGURATIONS
+# ncnf
+    0
+# nvcnf
+#   n    l    f
+    </PP_INPUTFILE>
+  </PP_INFO>
+  <!--                               -->
+  <!-- END OF HUMAN READABLE SECTION -->
+  <!--                               -->
+    <PP_HEADER
+       generated="Generated using ONCVPSP code by D. R. Hamann"
+       author="Martin Schlipf and Francois Gygi"
+       date="150105"
+       comment=""
+       element="O "
+       pseudo_type="NC"
+       relativistic="scalar"
+       is_ultrasoft="F"
+       is_paw="F"
+       is_coulomb="F"
+       has_so="F"
+       has_wfc="F"
+       has_gipaw="F"
+       core_correction="F"
+       functional="PBE"
+       z_valence="    6.00"
+       total_psenergy="  -1.57181652287E+01"
+       rho_cutoff="   6.01000000000E+00"
+       l_max="1"
+       l_local="-1"
+       mesh_size="   602"
+       number_of_wfc="0"
+       number_of_proj="4"/>
+ <PP_MESH>
+   <PP_R type="real"  size=" 602" columns="8">
+    0.0000    0.0100    0.0200    0.0300    0.0400    0.0500    0.0600    0.0700
+    0.0800    0.0900    0.1000    0.1100    0.1200    0.1300    0.1400    0.1500
+    0.1600    0.1700    0.1800    0.1900    0.2000    0.2100    0.2200    0.2300
+    0.2400    0.2500    0.2600    0.2700    0.2800    0.2900    0.3000    0.3100
+    0.3200    0.3300    0.3400    0.3500    0.3600    0.3700    0.3800    0.3900
+    0.4000    0.4100    0.4200    0.4300    0.4400    0.4500    0.4600    0.4700
+    0.4800    0.4900    0.5000    0.5100    0.5200    0.5300    0.5400    0.5500
+    0.5600    0.5700    0.5800    0.5900    0.6000    0.6100    0.6200    0.6300
+    0.6400    0.6500    0.6600    0.6700    0.6800    0.6900    0.7000    0.7100
+    0.7200    0.7300    0.7400    0.7500    0.7600    0.7700    0.7800    0.7900
+    0.8000    0.8100    0.8200    0.8300    0.8400    0.8500    0.8600    0.8700
+    0.8800    0.8900    0.9000    0.9100    0.9200    0.9300    0.9400    0.9500
+    0.9600    0.9700    0.9800    0.9900    1.0000    1.0100    1.0200    1.0300
+    1.0400    1.0500    1.0600    1.0700    1.0800    1.0900    1.1000    1.1100
+    1.1200    1.1300    1.1400    1.1500    1.1600    1.1700    1.1800    1.1900
+    1.2000    1.2100    1.2200    1.2300    1.2400    1.2500    1.2600    1.2700
+    1.2800    1.2900    1.3000    1.3100    1.3200    1.3300    1.3400    1.3500
+    1.3600    1.3700    1.3800    1.3900    1.4000    1.4100    1.4200    1.4300
+    1.4400    1.4500    1.4600    1.4700    1.4800    1.4900    1.5000    1.5100
+    1.5200    1.5300    1.5400    1.5500    1.5600    1.5700    1.5800    1.5900
+    1.6000    1.6100    1.6200    1.6300    1.6400    1.6500    1.6600    1.6700
+    1.6800    1.6900    1.7000    1.7100    1.7200    1.7300    1.7400    1.7500
+    1.7600    1.7700    1.7800    1.7900    1.8000    1.8100    1.8200    1.8300
+    1.8400    1.8500    1.8600    1.8700    1.8800    1.8900    1.9000    1.9100
+    1.9200    1.9300    1.9400    1.9500    1.9600    1.9700    1.9800    1.9900
+    2.0000    2.0100    2.0200    2.0300    2.0400    2.0500    2.0600    2.0700
+    2.0800    2.0900    2.1000    2.1100    2.1200    2.1300    2.1400    2.1500
+    2.1600    2.1700    2.1800    2.1900    2.2000    2.2100    2.2200    2.2300
+    2.2400    2.2500    2.2600    2.2700    2.2800    2.2900    2.3000    2.3100
+    2.3200    2.3300    2.3400    2.3500    2.3600    2.3700    2.3800    2.3900
+    2.4000    2.4100    2.4200    2.4300    2.4400    2.4500    2.4600    2.4700
+    2.4800    2.4900    2.5000    2.5100    2.5200    2.5300    2.5400    2.5500
+    2.5600    2.5700    2.5800    2.5900    2.6000    2.6100    2.6200    2.6300
+    2.6400    2.6500    2.6600    2.6700    2.6800    2.6900    2.7000    2.7100
+    2.7200    2.7300    2.7400    2.7500    2.7600    2.7700    2.7800    2.7900
+    2.8000    2.8100    2.8200    2.8300    2.8400    2.8500    2.8600    2.8700
+    2.8800    2.8900    2.9000    2.9100    2.9200    2.9300    2.9400    2.9500
+    2.9600    2.9700    2.9800    2.9900    3.0000    3.0100    3.0200    3.0300
+    3.0400    3.0500    3.0600    3.0700    3.0800    3.0900    3.1000    3.1100
+    3.1200    3.1300    3.1400    3.1500    3.1600    3.1700    3.1800    3.1900
+    3.2000    3.2100    3.2200    3.2300    3.2400    3.2500    3.2600    3.2700
+    3.2800    3.2900    3.3000    3.3100    3.3200    3.3300    3.3400    3.3500
+    3.3600    3.3700    3.3800    3.3900    3.4000    3.4100    3.4200    3.4300
+    3.4400    3.4500    3.4600    3.4700    3.4800    3.4900    3.5000    3.5100
+    3.5200    3.5300    3.5400    3.5500    3.5600    3.5700    3.5800    3.5900
+    3.6000    3.6100    3.6200    3.6300    3.6400    3.6500    3.6600    3.6700
+    3.6800    3.6900    3.7000    3.7100    3.7200    3.7300    3.7400    3.7500
+    3.7600    3.7700    3.7800    3.7900    3.8000    3.8100    3.8200    3.8300
+    3.8400    3.8500    3.8600    3.8700    3.8800    3.8900    3.9000    3.9100
+    3.9200    3.9300    3.9400    3.9500    3.9600    3.9700    3.9800    3.9900
+    4.0000    4.0100    4.0200    4.0300    4.0400    4.0500    4.0600    4.0700
+    4.0800    4.0900    4.1000    4.1100    4.1200    4.1300    4.1400    4.1500
+    4.1600    4.1700    4.1800    4.1900    4.2000    4.2100    4.2200    4.2300
+    4.2400    4.2500    4.2600    4.2700    4.2800    4.2900    4.3000    4.3100
+    4.3200    4.3300    4.3400    4.3500    4.3600    4.3700    4.3800    4.3900
+    4.4000    4.4100    4.4200    4.4300    4.4400    4.4500    4.4600    4.4700
+    4.4800    4.4900    4.5000    4.5100    4.5200    4.5300    4.5400    4.5500
+    4.5600    4.5700    4.5800    4.5900    4.6000    4.6100    4.6200    4.6300
+    4.6400    4.6500    4.6600    4.6700    4.6800    4.6900    4.7000    4.7100
+    4.7200    4.7300    4.7400    4.7500    4.7600    4.7700    4.7800    4.7900
+    4.8000    4.8100    4.8200    4.8300    4.8400    4.8500    4.8600    4.8700
+    4.8800    4.8900    4.9000    4.9100    4.9200    4.9300    4.9400    4.9500
+    4.9600    4.9700    4.9800    4.9900    5.0000    5.0100    5.0200    5.0300
+    5.0400    5.0500    5.0600    5.0700    5.0800    5.0900    5.1000    5.1100
+    5.1200    5.1300    5.1400    5.1500    5.1600    5.1700    5.1800    5.1900
+    5.2000    5.2100    5.2200    5.2300    5.2400    5.2500    5.2600    5.2700
+    5.2800    5.2900    5.3000    5.3100    5.3200    5.3300    5.3400    5.3500
+    5.3600    5.3700    5.3800    5.3900    5.4000    5.4100    5.4200    5.4300
+    5.4400    5.4500    5.4600    5.4700    5.4800    5.4900    5.5000    5.5100
+    5.5200    5.5300    5.5400    5.5500    5.5600    5.5700    5.5800    5.5900
+    5.6000    5.6100    5.6200    5.6300    5.6400    5.6500    5.6600    5.6700
+    5.6800    5.6900    5.7000    5.7100    5.7200    5.7300    5.7400    5.7500
+    5.7600    5.7700    5.7800    5.7900    5.8000    5.8100    5.8200    5.8300
+    5.8400    5.8500    5.8600    5.8700    5.8800    5.8900    5.9000    5.9100
+    5.9200    5.9300    5.9400    5.9500    5.9600    5.9700    5.9800    5.9900
+    6.0000    6.0100
+   </PP_R>
+   <PP_RAB type="real"  size=" 602" columns="8">
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100    0.0100
+    0.0100    0.0100
+   </PP_RAB>
+ </PP_MESH>
+  <PP_LOCAL type="real"  size=" 602" columns="4">
+   -2.7605700345E+01   -3.0784865229E+01   -3.2349253618E+01   -3.2751366129E+01
+   -3.2443703381E+01   -3.1938024293E+01   -3.1464309182E+01   -3.1081089156E+01
+   -3.0780110534E+01   -3.0537539935E+01   -3.0331510013E+01   -3.0145737072E+01
+   -2.9968950315E+01   -2.9793612997E+01   -2.9614858119E+01   -2.9429745358E+01
+   -2.9236754295E+01   -2.9035419392E+01   -2.8826044018E+01   -2.8609460457E+01
+   -2.8386823980E+01   -2.8159440471E+01   -2.7928630453E+01   -2.7695631454E+01
+   -2.7461537210E+01   -2.7227268680E+01   -2.6993569183E+01   -2.6761015821E+01
+   -2.6530039202E+01   -2.6300945472E+01   -2.6073936437E+01   -2.5849125236E+01
+   -2.5626546546E+01   -2.5406161554E+01   -2.5187858066E+01   -2.4971448643E+01
+   -2.4756666888E+01   -2.4543166280E+01   -2.4330521781E+01   -2.4118237688E+01
+   -2.3905762462E+01   -2.3692510781E+01   -2.3477892157E+01   -2.3261343931E+01
+   -2.3042365424E+01   -2.2820549497E+01   -2.2595607833E+01   -2.2367387034E+01
+   -2.2135873927E+01   -2.1901189722E+01   -2.1663577130E+01   -2.1423377048E+01
+   -2.1181003715E+01   -2.0936919717E+01   -2.0691610496E+01   -2.0445564221E+01
+   -2.0199254652E+01   -1.9953130261E+01   -1.9707605286E+01   -1.9463057569E+01
+   -1.9219826157E+01   -1.8978213343E+01   -1.8738487400E+01   -1.8500884660E+01
+   -1.8265614596E+01   -1.8032862583E+01   -1.7802792920E+01   -1.7575552408E+01
+   -1.7351272692E+01   -1.7130071760E+01   -1.6912056036E+01   -1.6697321537E+01
+   -1.6485954716E+01   -1.6278033179E+01   -1.6073625719E+01   -1.5872792819E+01
+   -1.5675586595E+01   -1.5482050684E+01   -1.5292220051E+01   -1.5106120725E+01
+   -1.4923769479E+01   -1.4745173477E+01   -1.4570329881E+01   -1.4399225454E+01
+   -1.4231836134E+01   -1.4068126609E+01   -1.3908049886E+01   -1.3751546850E+01
+   -1.3598545829E+01   -1.3448962145E+01   -1.3302697611E+01   -1.3159640031E+01
+   -1.3019669910E+01   -1.2882669787E+01   -1.2748527869E+01   -1.2617137724E+01
+   -1.2488398068E+01   -1.2362212574E+01   -1.2238489692E+01   -1.2117142884E+01
+   -1.1998090066E+01   -1.1881253628E+01   -1.1766560302E+01   -1.1653941032E+01
+   -1.1543330569E+01   -1.1434668196E+01   -1.1327896756E+01   -1.1222962743E+01
+   -1.1119816109E+01   -1.1018409929E+01   -1.0918701041E+01   -1.0820648952E+01
+   -1.0724215912E+01   -1.0629366457E+01   -1.0536068056E+01   -1.0444290028E+01
+   -1.0354003461E+01   -1.0265180964E+01   -1.0177796938E+01   -1.0091826632E+01
+   -1.0007246112E+01   -9.9240322892E+00   -9.8421624187E+00   -9.7616139299E+00
+   -9.6823643703E+00   -9.6043910435E+00   -9.5276710191E+00   -9.4521810563E+00
+   -9.3778971539E+00   -9.3047950086E+00   -9.2328496504E+00   -9.1620363695E+00
+   -9.0923305110E+00   -9.0237077361E+00   -8.9561431451E+00   -8.8896116158E+00
+   -8.8240890211E+00   -8.7595512275E+00   -8.6959740871E+00   -8.6333343002E+00
+   -8.5716086751E+00   -8.5107748378E+00   -8.4508110921E+00   -8.3916960209E+00
+   -8.3334097570E+00   -8.2759323779E+00   -8.2192454990E+00   -8.1633312824E+00
+   -8.1081727717E+00   -8.0537547331E+00   -8.0000623436E+00   -7.9470813275E+00
+   -7.8947972321E+00   -7.8431966714E+00   -7.7922661900E+00   -7.7419928953E+00
+   -7.6923641381E+00   -7.6433675558E+00   -7.5949912477E+00   -7.5472233618E+00
+   -7.5000526824E+00   -7.4534678545E+00   -7.4074582876E+00   -7.3620131113E+00
+   -7.3171223055E+00   -7.2727754717E+00   -7.2289631023E+00   -7.1856752817E+00
+   -7.1429029518E+00   -7.1006366576E+00   -7.0588677730E+00   -7.0175872653E+00
+   -6.9767869231E+00   -6.9364581153E+00   -6.8965930059E+00   -6.8571833718E+00
+   -6.8182217002E+00   -6.7797001743E+00   -6.7416115702E+00   -6.7039484718E+00
+   -6.6667039108E+00   -6.6298708673E+00   -6.5934425974E+00   -6.5574124728E+00
+   -6.5217739457E+00   -6.4865207734E+00   -6.4516465808E+00   -6.4171455001E+00
+   -6.3830113137E+00   -6.3492384567E+00   -6.3158210267E+00   -6.2827535725E+00
+   -6.2500305645E+00   -6.2176466374E+00   -6.1855966333E+00   -6.1538752555E+00
+   -6.1224776754E+00   -6.0913987519E+00   -6.0606338291E+00   -6.0301780858E+00
+   -6.0000268845E+00   -5.9701757684E+00   -5.9406201055E+00   -5.9113557302E+00
+   -5.8823781953E+00   -5.8536833983E+00   -5.8252672254E+00   -5.7971255362E+00
+   -5.7692545325E+00   -5.7416501530E+00   -5.7143087187E+00   -5.6872264675E+00
+   -5.6603996483E+00   -5.6338248102E+00   -5.6074982583E+00   -5.5814166408E+00
+   -5.5555765569E+00   -5.5299745536E+00   -5.5046075112E+00   -5.4794720848E+00
+   -5.4545651655E+00   -5.4298837212E+00   -5.4054245210E+00   -5.3811847580E+00
+   -5.3571614267E+00   -5.3333515869E+00   -5.3097525260E+00   -5.2863613325E+00
+   -5.2631753334E+00   -5.2401918998E+00   -5.2174082208E+00   -5.1948218607E+00
+   -5.1724302389E+00   -5.1502307471E+00   -5.1282210592E+00   -5.1063986772E+00
+   -5.0847611945E+00   -5.0633063698E+00   -5.0420317959E+00   -5.0209352464E+00
+   -5.0000145619E+00   -4.9792674276E+00   -4.9586917756E+00   -4.9382855278E+00
+   -4.9180464624E+00   -4.8979726494E+00   -4.8780620924E+00   -4.8583126624E+00
+   -4.8387225484E+00   -4.8192898345E+00   -4.8000124892E+00   -4.7808887980E+00
+   -4.7619169283E+00   -4.7430949440E+00   -4.7244212124E+00   -4.7058939893E+00
+   -4.6875114188E+00   -4.6692719493E+00   -4.6511739183E+00   -4.6332155580E+00
+   -4.6153953755E+00   -4.5977117925E+00   -4.5801631296E+00   -4.5627479386E+00
+   -4.5454647266E+00   -4.5283119032E+00   -4.5112880519E+00   -4.4943917650E+00
+   -4.4776215414E+00   -4.4609759844E+00   -4.4444537713E+00   -4.4280534916E+00
+   -4.4117737574E+00   -4.3956133298E+00   -4.3795708900E+00   -4.3636450498E+00
+   -4.3478346519E+00   -4.3321384475E+00   -4.3165551080E+00   -4.3010834883E+00
+   -4.2857224193E+00   -4.2704706626E+00   -4.2553270471E+00   -4.2402904912E+00
+   -4.2253598492E+00   -4.2105339172E+00   -4.1958116967E+00   -4.1811921057E+00
+   -4.1666739956E+00   -4.1522563551E+00   -4.1379381819E+00   -4.1237184204E+00
+   -4.1095960051E+00   -4.0955700209E+00   -4.0816394841E+00   -4.0678033425E+00
+   -4.0540606926E+00   -4.0404106187E+00   -4.0268521620E+00   -4.0133843475E+00
+   -4.0000063484E+00   -3.9867172703E+00   -3.9735161622E+00   -3.9604021828E+00
+   -3.9473745101E+00   -3.9344322882E+00   -3.9215745909E+00   -3.9088006833E+00
+   -3.8961097526E+00   -3.8835009538E+00   -3.8709734686E+00   -3.8585265734E+00
+   -3.8461594934E+00   -3.8338713986E+00   -3.8216615661E+00   -3.8095292837E+00
+   -3.7974738126E+00   -3.7854943426E+00   -3.7735902317E+00   -3.7617607813E+00
+   -3.7500052762E+00   -3.7383229617E+00   -3.7267132311E+00   -3.7151754129E+00
+   -3.7037088125E+00   -3.6923127373E+00   -3.6809865997E+00   -3.6697297595E+00
+   -3.6585415452E+00   -3.6474213158E+00   -3.6363685045E+00   -3.6253825007E+00
+   -3.6144626587E+00   -3.6036083788E+00   -3.5928191167E+00   -3.5820942902E+00
+   -3.5714332815E+00   -3.5608355222E+00   -3.5503004929E+00   -3.5398276385E+00
+   -3.5294163709E+00   -3.5190661440E+00   -3.5087764652E+00   -3.4985468053E+00
+   -3.4883766076E+00   -3.4782653400E+00   -3.4682125383E+00   -3.4582176981E+00
+   -3.4482802961E+00   -3.4383998061E+00   -3.4285757940E+00   -3.4188077789E+00
+   -3.4090952728E+00   -3.3994377480E+00   -3.3898348019E+00   -3.3802859760E+00
+   -3.3707908119E+00   -3.3613487964E+00   -3.3519595367E+00   -3.3426226031E+00
+   -3.3333375585E+00   -3.3241039280E+00   -3.3149213002E+00   -3.3057892823E+00
+   -3.2967074576E+00   -3.2876753919E+00   -3.2786926486E+00   -3.2697588729E+00
+   -3.2608736674E+00   -3.2520366348E+00   -3.2432473253E+00   -3.2345054052E+00
+   -3.2258105008E+00   -3.2171622335E+00   -3.2085601965E+00   -3.2000040171E+00
+   -3.1914933640E+00   -3.1830278760E+00   -3.1746071920E+00   -3.1662308963E+00
+   -3.1578986987E+00   -3.1496102547E+00   -3.1413652201E+00   -3.1331632249E+00
+   -3.1250039281E+00   -3.1168870307E+00   -3.1088122044E+00   -3.1007791210E+00
+   -3.0927874062E+00   -3.0848367838E+00   -3.0769269476E+00   -3.0690575847E+00
+   -3.0612283700E+00   -3.0534389657E+00   -3.0456891132E+00   -3.0379785141E+00
+   -3.0303068700E+00   -3.0226738555E+00   -3.0150791818E+00   -3.0075225888E+00
+   -3.0000037918E+00   -2.9925225065E+00   -2.9850784103E+00   -2.9776712548E+00
+   -2.9703007806E+00   -2.9629667163E+00   -2.9556687906E+00   -2.9484066870E+00
+   -2.9411801890E+00   -2.9339890404E+00   -2.9268329824E+00   -2.9197117526E+00
+   -2.9126250543E+00   -2.9055726836E+00   -2.8985543940E+00   -2.8915699388E+00
+   -2.8846190644E+00   -2.8777014940E+00   -2.8708170314E+00   -2.8639654413E+00
+   -2.8571464888E+00   -2.8503599315E+00   -2.8436055055E+00   -2.8368830243E+00
+   -2.8301922637E+00   -2.8235329996E+00   -2.8169050030E+00   -2.8103080169E+00
+   -2.8037418662E+00   -2.7972063374E+00   -2.7907012166E+00   -2.7842262903E+00
+   -2.7777813024E+00   -2.7713660913E+00   -2.7649804532E+00   -2.7586241845E+00
+   -2.7522970813E+00   -2.7459989054E+00   -2.7397294879E+00   -2.7334886419E+00
+   -2.7272761732E+00   -2.7210918876E+00   -2.7149355663E+00   -2.7088070278E+00
+   -2.7027061039E+00   -2.6966326094E+00   -2.6905863591E+00   -2.6845671557E+00
+   -2.6785748003E+00   -2.6726091446E+00   -2.6666700119E+00   -2.6607572257E+00
+   -2.6548706095E+00   -2.6490099504E+00   -2.6431751131E+00   -2.6373659318E+00
+   -2.6315822382E+00   -2.6258238639E+00   -2.6200906210E+00   -2.6143823457E+00
+   -2.6086988964E+00   -2.6030401127E+00   -2.5974058340E+00   -2.5917958991E+00
+   -2.5862101123E+00   -2.5806483565E+00   -2.5751104788E+00   -2.5695963262E+00
+   -2.5641057455E+00   -2.5586385667E+00   -2.5531946375E+00   -2.5477738314E+00
+   -2.5423760028E+00   -2.5370010057E+00   -2.5316486943E+00   -2.5263188926E+00
+   -2.5210114847E+00   -2.5157263372E+00   -2.5104633109E+00   -2.5052222669E+00
+   -2.5000030605E+00   -2.4948055297E+00   -2.4896295712E+00   -2.4844750523E+00
+   -2.4793418405E+00   -2.4742298031E+00   -2.4691387940E+00   -2.4640686757E+00
+   -2.4590193426E+00   -2.4539906681E+00   -2.4489825259E+00   -2.4439947896E+00
+   -2.4390273136E+00   -2.4340799794E+00   -2.4291526807E+00   -2.4242452972E+00
+   -2.4193577083E+00   -2.4144897934E+00   -2.4096414103E+00   -2.4048124528E+00
+   -2.4000028171E+00   -2.3952123882E+00   -2.3904410511E+00   -2.3856886909E+00
+   -2.3809551705E+00   -2.3762403912E+00   -2.3715442531E+00   -2.3668666465E+00
+   -2.3622074621E+00   -2.3575665900E+00   -2.3529439004E+00   -2.3483392967E+00
+   -2.3437526852E+00   -2.3391839614E+00   -2.3346330208E+00   -2.3300997590E+00
+   -2.3255840547E+00   -2.3210858091E+00   -2.3166049363E+00   -2.3121413368E+00
+   -2.3076949109E+00   -2.3032655590E+00   -2.2988531707E+00   -2.2944576400E+00
+   -2.2900788908E+00   -2.2857168282E+00   -2.2813713571E+00   -2.2770423826E+00
+   -2.2727298068E+00   -2.2684335124E+00   -2.2641534347E+00   -2.2598894829E+00
+   -2.2556415666E+00   -2.2514095953E+00   -2.2471934785E+00   -2.2429931038E+00
+   -2.2388083992E+00   -2.2346392847E+00   -2.2304856742E+00   -2.2263474813E+00
+   -2.2222246197E+00   -2.2181169931E+00   -2.2140245088E+00   -2.2099471023E+00
+   -2.2058846913E+00   -2.2018371933E+00   -2.1978045263E+00   -2.1937866078E+00
+   -2.1897833323E+00   -2.1857946406E+00   -2.1818204576E+00   -2.1778607050E+00
+   -2.1739153043E+00   -2.1699841770E+00   -2.1660672366E+00   -2.1621643955E+00
+   -2.1582755972E+00   -2.1544007668E+00   -2.1505398296E+00   -2.1466927107E+00
+   -2.1428593354E+00   -2.1390396118E+00   -2.1352334757E+00   -2.1314408644E+00
+   -2.1276617067E+00   -2.1238959313E+00   -2.1201434667E+00   -2.1164042418E+00
+   -2.1126781618E+00   -2.1089651802E+00   -2.1052652307E+00   -2.1015782452E+00
+   -2.0979041558E+00   -2.0942428944E+00   -2.0905943898E+00   -2.0869585565E+00
+   -2.0833353508E+00   -2.0797247079E+00   -2.0761265629E+00   -2.0725408510E+00
+   -2.0689675074E+00   -2.0654064613E+00   -2.0618576366E+00   -2.0583209894E+00
+   -2.0547964579E+00   -2.0512839803E+00   -2.0477834948E+00   -2.0442949396E+00
+   -2.0408182464E+00   -2.0373533437E+00   -2.0339001895E+00   -2.0304587249E+00
+   -2.0270288910E+00   -2.0236106288E+00   -2.0202038794E+00   -2.0168085786E+00
+   -2.0134246558E+00   -2.0100520723E+00   -2.0066907719E+00   -2.0033406985E+00
+   -2.0000017959E+00   -1.9966740079E+00
+  </PP_LOCAL>
+ <PP_NONLOCAL>
+   <PP_BETA.1
+       type="real"
+       size=" 602"
+       columns="4"
+       index="1"
+       angular_momentum="0"
+       cutoff_radius_index=" 152"
+       cutoff_radius="    1.5100000000E+00" >
+    0.0000000000E+00   -8.2277987587E-02   -1.6449650094E-01   -2.4659331589E-01
+   -3.2850076507E-01   -4.1014315697E-01   -4.9143436126E-01   -5.7227561145E-01
+   -6.5255357242E-01   -7.3213871601E-01   -8.1088404273E-01   -8.8862418182E-01
+   -9.6517489579E-01   -1.0403330086E+00   -1.1138767696E+00   -1.1855666583E+00
+   -1.2551466268E+00   -1.3223457701E+00   -1.3868804065E+00   -1.4484565448E+00
+   -1.5067727038E+00   -1.5615230499E+00   -1.6124008059E+00   -1.6591018874E+00
+   -1.7013287065E+00   -1.7387940878E+00   -1.7712252542E+00   -1.7983677872E+00
+   -1.8199895278E+00   -1.8358843512E+00   -1.8458757490E+00   -1.8498201626E+00
+   -1.8476100093E+00   -1.8391763180E+00   -1.8244910913E+00   -1.8035689946E+00
+   -1.7764688338E+00   -1.7432942635E+00   -1.7041941790E+00   -1.6593624450E+00
+   -1.6090370910E+00   -1.5534989864E+00   -1.4930699667E+00   -1.4281104400E+00
+   -1.3590165014E+00   -1.2862165869E+00   -1.2101677086E+00   -1.1313513155E+00
+   -1.0502688403E+00   -9.6743710159E-01   -8.8338307362E-01   -7.9863905513E-01
+   -7.1373774907E-01   -6.2920651981E-01   -5.4556280508E-01   -4.6330924845E-01
+   -3.8292814427E-01   -3.0487830138E-01   -2.2958906110E-01   -1.5745833560E-01
+   -8.8847539142E-02   -2.4079508545E-02    3.6563691834E-02    9.2844371601E-02
+    1.4456869600E-01    1.9158861786E-01    2.3380285278E-01    2.7115554880E-01
+    3.0363571638E-01    3.3127718718E-01    3.5415591151E-01    3.7238794759E-01
+    3.8612701355E-01    3.9556147997E-01    4.0091144918E-01    4.0242475349E-01
+    4.0037331823E-01    3.9504921005E-01    3.8676054393E-01    3.7582731451E-01
+    3.6257720433E-01    3.4734141910E-01    3.3045059692E-01    3.1223083453E-01
+    2.9299986887E-01    2.7306344758E-01    2.5271191615E-01    2.3221704416E-01
+    2.1182910699E-01    1.9177423337E-01    1.7225200095E-01    1.5343329037E-01
+    1.3546088278E-01    1.1845195585E-01    1.0249965400E-01    8.7673842851E-02
+    7.4022153762E-02    6.1571422473E-02    5.0329214779E-02    4.0284462837E-02
+    3.1410783526E-02    2.3667878040E-02    1.7003522416E-02    1.1355567187E-02
+    6.6538808501E-03    2.8220678737E-03   -2.2001240793E-04   -2.5547866842E-03
+   -4.2654366871E-03   -5.4349451843E-03   -6.1429431867E-03   -6.4660691080E-03
+   -6.4767064284E-03   -6.2432624449E-03   -5.8262992394E-03   -5.2807905609E-03
+   -4.6556441299E-03   -3.9940297562E-03   -3.3299352117E-03   -2.6921631841E-03
+   -2.1041805481E-03   -1.5819270303E-03   -1.1356467997E-03   -7.7141771203E-04
+   -4.9002008278E-04   -2.8685599057E-04   -1.5440700844E-04   -8.1295680504E-05
+   -5.2958831332E-05   -4.9024979272E-05   -2.2654023957E-05    2.0802206378E-06
+    1.6220072646E-06    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00
+   </PP_BETA.1>
+   <PP_BETA.2
+       type="real"
+       size=" 602"
+       columns="4"
+       index="2"
+       angular_momentum="0"
+       cutoff_radius_index=" 152"
+       cutoff_radius="    1.5100000000E+00" >
+    0.0000000000E+00   -1.1723087215E-02   -2.2970588285E-02   -3.3277762775E-02
+   -4.2201369170E-02   -4.9329936250E-02   -5.4293469599E-02   -5.6772419538E-02
+   -5.6505748973E-02   -5.3297954463E-02   -4.7024911330E-02   -3.7638432778E-02
+   -2.5169454710E-02   -9.7297804400E-03    8.4876559486E-03    2.9210025055E-02
+    5.2087327705E-02    7.6696576823E-02    1.0254737811E-01    1.2908882978E-01
+    1.5571763119E-01    1.8178728239E-01    2.0661822255E-01    2.2950875795E-01
+    2.4974659019E-01    2.6662075364E-01    2.7943382524E-01    2.8751410085E-01
+    2.9022762150E-01    2.8698983512E-01    2.7727667838E-01    2.6063490065E-01
+    2.3669144662E-01    2.0516162976E-01    1.6585646477E-01    1.1868792319E-01
+    6.3673539750E-02    9.3865644130E-04   -6.9282296205E-02   -1.4664653237E-01
+   -2.3070525508E-01   -3.2090755014E-01   -4.1660590836E-01   -5.1706326588E-01
+   -6.2146146368E-01   -7.2891100660E-01   -8.3846198150E-01   -9.4911597183E-01
+   -1.0598387764E+00   -1.1695734189E+00   -1.2772547958E+00   -1.3818230208E+00
+   -1.4822368409E+00   -1.5774891053E+00   -1.6666187728E+00   -1.7487234696E+00
+   -1.8229735575E+00   -1.8886194365E+00   -1.9450060422E+00   -1.9915764523E+00
+   -2.0278831780E+00   -2.0535914046E+00   -2.0684822557E+00   -2.0724589319E+00
+   -2.0655426479E+00   -2.0478740211E+00   -2.0197115502E+00   -1.9814248238E+00
+   -1.9334892505E+00   -1.8764808008E+00   -1.8110656211E+00   -1.7379906968E+00
+   -1.6580733899E+00   -1.5721902388E+00   -1.4812642140E+00   -1.3862522622E+00
+   -1.2881324457E+00   -1.1878908963E+00   -1.0865088556E+00   -9.8494996663E-01
+   -8.8414798922E-01   -7.8499509134E-01   -6.8833085835E-01   -5.9493213952E-01
+   -5.0550383294E-01   -4.2067068746E-01   -3.4097017799E-01   -2.6684648767E-01
+   -1.9864560686E-01   -1.3661153627E-01   -8.0883449121E-02   -3.1493861467E-02
+    1.1620322833E-02    4.8599589644E-02    7.9650226738E-02    1.0503788587E-01
+    1.2508039801E-01    1.4013995079E-01    1.5061541504E-01    1.5693366067E-01
+    1.5954043230E-01    1.5889246200E-01    1.5544945597E-01    1.4966651822E-01
+    1.4199014613E-01    1.3284299597E-01    1.2262519169E-01    1.1170778401E-01
+    1.0042928223E-01    8.9095504256E-02    7.7963775928E-02    6.7255547029E-02
+    5.7152141457E-02    4.7799764163E-02    3.9294804857E-02    3.1701123437E-02
+    2.5050278343E-02    1.9344675342E-02    1.4551822257E-02    1.0621676019E-02
+    7.4866418173E-03    5.0616443312E-03    3.2542385947E-03    1.9673338834E-03
+    1.1012904675E-03    5.6303386455E-04    2.6418408232E-04    1.2416036016E-04
+    8.3638494781E-05    8.3445532246E-05    3.9712888830E-05   -3.5905440177E-06
+   -2.7996494097E-06    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00
+   </PP_BETA.2>
+   <PP_BETA.3
+       type="real"
+       size=" 602"
+       columns="4"
+       index="3"
+       angular_momentum="1"
+       cutoff_radius_index=" 152"
+       cutoff_radius="    1.5100000000E+00" >
+    0.0000000000E+00    3.5860269827E-03    1.4317078272E-02    3.2112256128E-02
+    5.6837367274E-02    8.8305873299E-02    1.2628021312E-01    1.7047348950E-01
+    2.2055150919E-01    2.7613516420E-01    3.3680313934E-01    4.0209492899E-01
+    4.7151414403E-01    5.4453208758E-01    6.2059157645E-01    6.9911098297E-01
+    7.7948847033E-01    8.6110639245E-01    9.4333582837E-01    1.0255412195E+00
+    1.1070850756E+00    1.1873327175E+00    1.2656570189E+00    1.3414431144E+00
+    1.4140930355E+00    1.4830302391E+00    1.5477039940E+00    1.6075935868E+00
+    1.6622123152E+00    1.7111112337E+00    1.7538826190E+00    1.7901631245E+00
+    1.8196365952E+00    1.8420365276E+00    1.8571481070E+00    1.8648098949E+00
+    1.8649150077E+00    1.8574119504E+00    1.8423049075E+00    1.8196536236E+00
+    1.7895728317E+00    1.7522312003E+00    1.7078498355E+00    1.6567003418E+00
+    1.5991024567E+00    1.5354212770E+00    1.4660641009E+00    1.3914769134E+00
+    1.3121405542E+00    1.2285667331E+00    1.1412932403E+00    1.0508797039E+00
+    9.5790298913E-01    8.6295169350E-01    7.6662172078E-01    6.6951129918E-01
+    5.7221537424E-01    4.7532181715E-01    3.7940524579E-01    2.8502383408E-01
+    1.9271369849E-01    1.0298539150E-01    1.6320224853E-02   -6.6834318537E-02
+   -1.4606648327E-01   -2.2100368547E-01   -2.9131520005E-01   -3.5671342275E-01
+   -4.1695549560E-01   -4.7184486654E-01   -5.2123135255E-01   -5.6501133337E-01
+   -6.0312746230E-01   -6.3556796384E-01   -6.6236527842E-01   -6.8359449970E-01
+   -6.9937142322E-01   -7.0985023488E-01   -7.1522090252E-01   -7.1570630926E-01
+   -7.1155917402E-01   -7.0305880711E-01   -6.9050775058E-01   -6.7422835456E-01
+   -6.5455934146E-01   -6.3185241011E-01   -6.0646893131E-01   -5.7877678564E-01
+   -5.4914739239E-01   -5.1795297687E-01   -4.8556415192E-01   -4.5234782509E-01
+   -4.1866194064E-01   -3.8484854594E-01   -3.5122924483E-01   -3.1810273720E-01
+   -2.8574272301E-01   -2.5439648427E-01   -2.2428377777E-01   -1.9559309794E-01
+   -1.6848457254E-01   -1.4308867914E-01   -1.1950650654E-01   -9.7810361217E-02
+   -7.8045864567E-02   -6.0228944123E-02   -4.4352023434E-02   -3.0384350338E-02
+   -1.8273960617E-02   -7.9495883994E-03    6.7759973479E-04    7.7099198058E-03
+    1.3261526160E-02    1.7457218324E-02    2.0427124578E-02    2.2305646179E-02
+    2.3229681315E-02    2.3338002873E-02    2.2762351834E-02    2.1631893993E-02
+    2.0072411241E-02    1.8198596607E-02    1.6115058374E-02    1.3919754929E-02
+    1.1698898123E-02    9.5228516845E-03    7.4546618226E-03    5.5452981103E-03
+    3.8288793415E-03    2.3331720374E-03    1.0736899984E-03    5.2926998136E-05
+   -7.3149996906E-04   -1.2929678601E-03   -1.6511600028E-03   -1.8297965622E-03
+   -1.8585360541E-03   -1.7678438403E-03   -1.5911002682E-03   -1.3586955246E-03
+   -1.1019690505E-03   -8.4702839256E-04   -6.1497715279E-04   -4.2541468309E-04
+   -2.8402093044E-04   -1.9725198792E-04   -1.6029137555E-04   -1.5177315345E-04
+   -8.9838576118E-05   -5.4771430827E-06    9.6147639048E-06    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00
+   </PP_BETA.3>
+   <PP_BETA.4
+       type="real"
+       size=" 602"
+       columns="4"
+       index="4"
+       angular_momentum="1"
+       cutoff_radius_index=" 152"
+       cutoff_radius="    1.5100000000E+00" >
+    0.0000000000E+00    9.2893242255E-04    3.7019764676E-03    8.2779769445E-03
+    1.4588689808E-02    2.2539302945E-02    3.2009163013E-02    4.2852705981E-02
+    5.4900588274E-02    6.7961014587E-02    8.1821257513E-02    9.6249363136E-02
+    1.1099603565E-01    1.2579669284E-01    1.4037368313E-01    1.5443865322E-01
+    1.6769505435E-01    1.7984077312E-01    1.9057087191E-01    1.9958042202E-01
+    2.0656741042E-01    2.1123570101E-01    2.1329802783E-01    2.1247899806E-01
+    2.0851807950E-01    2.0117254726E-01    1.9022036400E-01    1.7546296384E-01
+    1.5672791414E-01    1.3387142592E-01    1.0678068413E-01    7.5375969726E-02
+    3.9612545899E-02   -5.1771162367E-04   -4.4984990254E-02   -9.3720492346E-02
+   -1.4661576728E-01   -2.0352233418E-01   -2.6425172921E-01   -3.2857590175E-01
+   -3.9622799242E-01   -4.6690352048E-01   -5.4026196118E-01   -6.1592871460E-01
+   -6.9349745874E-01   -7.7253287529E-01   -8.5257373131E-01   -9.3313629426E-01
+   -1.0137180499E+00   -1.0938015836E+00   -1.1728591565E+00   -1.2503568716E+00
+   -1.3257591460E+00   -1.3985339592E+00   -1.4681572088E+00   -1.5341175221E+00
+   -1.5959215886E+00   -1.6530979289E+00   -1.7052026499E+00   -1.7518226198E+00
+   -1.7925806287E+00   -1.8271386946E+00   -1.8552014319E+00   -1.8765200508E+00
+   -1.8908940379E+00   -1.8981739659E+00   -1.8982634918E+00   -1.8911201167E+00
+   -1.8767560905E+00   -1.8552388890E+00   -1.8266905450E+00   -1.7912868237E+00
+   -1.7492558153E+00   -1.7008761570E+00   -1.6464740899E+00   -1.5864208426E+00
+   -1.5211291980E+00   -1.4510496961E+00   -1.3766664971E+00   -1.2984929459E+00
+   -1.2170668968E+00   -1.1329458601E+00   -1.0467020356E+00   -9.5891730044E-01
+   -8.7017821889E-01   -7.8107114433E-01   -6.9217748077E-01   -6.0406917196E-01
+   -5.1730448344E-01   -4.3242413992E-01   -3.4994793756E-01   -2.7037179974E-01
+   -1.9415879235E-01   -1.2172780433E-01   -5.3447524144E-02    1.0365478214E-02
+    6.9448141778E-02    1.2359143475E-01    1.7264050244E-01    2.1649715017E-01
+    2.5511519340E-01    2.8850025641E-01    3.1670773256E-01    3.3984027408E-01
+    3.5804463559E-01    3.7150996533E-01    3.8046133188E-01    3.8515667437E-01
+    3.8588276317E-01    3.8295243350E-01    3.7669486668E-01    3.6745451357E-01
+    3.5558583975E-01    3.4145309072E-01    3.2541361978E-01    3.0782259726E-01
+    2.8902885306E-01    2.6937446596E-01    2.4917322584E-01    2.2872577817E-01
+    2.0831749242E-01    1.8820101658E-01    1.6860103788E-01    1.4972254242E-01
+    1.3173967350E-01    1.1478834281E-01    9.8985070983E-02    8.4416837992E-02
+    7.1132442003E-02    5.9164183104E-02    4.8515945607E-02    3.9162908641E-02
+    3.1067102133E-02    2.4166226359E-02    1.8383581049E-02    1.3631066681E-02
+    9.8082692597E-03    6.8108066814E-03    4.5261412594E-03    2.8479629339E-03
+    1.6647468801E-03    8.7544647586E-04    3.8748703362E-04    1.0892680591E-04
+   -2.3425222903E-05   -7.7922079727E-05   -9.4002294959E-05   -9.5906384375E-05
+   -5.6988379155E-05   -3.5463910332E-06    5.9608986436E-06    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00
+   </PP_BETA.4>
+   <PP_DIJ type="real"  size="  16" columns="4">
+    1.9514303897E+01    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    2.7522534413E+00    0.0000000000E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00   -9.6137176497E+00    0.0000000000E+00
+    0.0000000000E+00    0.0000000000E+00    0.0000000000E+00   -3.2324794045E+00
+   </PP_DIJ>
+ </PP_NONLOCAL>
+ <PP_PSWFC>
+ </PP_PSWFC>
+ <PP_RHOATOM type="real"  size=" 602" columns="4">
+    0.0000000000E+00    2.4555322044E-04    9.9435596078E-04    2.2826802809E-03
+    4.1704577663E-03    6.7405039734E-03    1.0097446971E-02    1.4366374033E-02
+    1.9691214178E-02    2.6232877622E-02    3.4167176269E-02    4.3682552029E-02
+    5.4977642082E-02    6.8258712011E-02    8.3736989112E-02    1.0162592925E-01
+    1.2213845080E-01    1.4548416964E-01    1.7186666804E-01    2.0148082992E-01
+    2.3451027351E-01    2.7112491013E-01    3.1147865648E-01    3.5570732483E-01
+    4.0392671304E-01    4.5623091264E-01    5.1269085328E-01    5.7335309214E-01
+    6.3823886033E-01    7.0734337238E-01    7.8063540062E-01    8.5805711439E-01
+    9.3952417996E-01    1.0249261017E+00    1.1141268534E+00    1.2069656628E+00
+    1.3032581166E+00    1.4027973581E+00    1.5053555697E+00    1.6106855669E+00
+    1.7185225365E+00    1.8285859201E+00    1.9405813920E+00    2.0542029191E+00
+    2.1691348784E+00    2.2850542133E+00    2.4016326049E+00    2.5185386399E+00
+    2.6354399503E+00    2.7520052604E+00    2.8679065939E+00    2.9828210875E+00
+    3.0964328328E+00    3.2084348565E+00    3.3185305939E+00    3.4264354515E+00
+    3.5318783577E+00    3.6346027795E+00    3.7343681356E+00    3.8309505035E+00
+    3.9241436467E+00    4.0137595888E+00    4.0996291450E+00    4.1816023622E+00
+    4.2595486912E+00    4.3333571276E+00    4.4029361120E+00    4.4682134089E+00
+    4.5291358858E+00    4.5856689029E+00    4.6377959244E+00    4.6855179084E+00
+    4.7288525836E+00    4.7678337886E+00    4.8025102209E+00    4.8329448489E+00
+    4.8592138549E+00    4.8814055976E+00    4.8996195505E+00    4.9139652152E+00
+    4.9245610181E+00    4.9315332006E+00    4.9350147101E+00    4.9351441007E+00
+    4.9320644514E+00    4.9259223085E+00    4.9168666605E+00    4.9050479496E+00
+    4.8906171284E+00    4.8737247647E+00    4.8545202011E+00    4.8331507724E+00
+    4.8097610853E+00    4.7844923629E+00    4.7574818564E+00    4.7288623255E+00
+    4.6987615887E+00    4.6673022280E+00    4.6346013382E+00    4.6007695137E+00
+    4.5659116073E+00    4.5301262966E+00    4.4935060361E+00    4.4561370652E+00
+    4.4180998488E+00    4.3794678786E+00    4.3403092049E+00    4.3006862196E+00
+    4.2606559084E+00    4.2202701844E+00    4.1795755629E+00    4.1386143513E+00
+    4.0974246608E+00    4.0560406954E+00    4.0144930190E+00    3.9728092235E+00
+    3.9310140868E+00    3.8891296584E+00    3.8471763585E+00    3.8051727118E+00
+    3.7631353587E+00    3.7210801524E+00    3.6790222306E+00    3.6369754752E+00
+    3.5949532922E+00    3.5529695496E+00    3.5110371051E+00    3.4691686011E+00
+    3.4273777892E+00    3.3856773650E+00    3.3440802075E+00    3.3026002179E+00
+    3.2612500462E+00    3.2200431162E+00    3.1789931977E+00    3.1381126416E+00
+    3.0974153228E+00    3.0569141416E+00    3.0166213097E+00    2.9765501384E+00
+    2.9367121016E+00    2.8971193603E+00    2.8577833885E+00    2.8187144722E+00
+    2.7799237680E+00    2.7414203117E+00    2.7032138975E+00    2.6653129770E+00
+    2.6277254184E+00    2.5904592077E+00    2.5535203424E+00    2.5169161760E+00
+    2.4806515332E+00    2.4447325241E+00    2.4091633992E+00    2.3739487914E+00
+    2.3390925610E+00    2.3045979624E+00    2.2704685078E+00    2.2367061977E+00
+    2.2033142099E+00    2.1702934279E+00    2.1376466921E+00    2.1053739087E+00
+    2.0734775689E+00    2.0419567389E+00    2.0108135020E+00    1.9800463428E+00
+    1.9496568192E+00    1.9196429197E+00    1.8900057747E+00    1.8607428608E+00
+    1.8318549701E+00    1.8033391361E+00    1.7751957603E+00    1.7474216534E+00
+    1.7200166764E+00    1.6929775891E+00    1.6663036438E+00    1.6399916950E+00
+    1.6140403080E+00    1.5884465799E+00    1.5632083065E+00    1.5383229695E+00
+    1.5137875184E+00    1.4895999441E+00    1.4657562937E+00    1.4422551560E+00
+    1.4190916630E+00    1.3962646344E+00    1.3737695287E+00    1.3516041505E+00
+    1.3297648393E+00    1.3082482527E+00    1.2870517468E+00    1.2661707761E+00
+    1.2456035241E+00    1.2253449792E+00    1.2053930652E+00    1.1857436665E+00
+    1.1663933299E+00    1.1473392986E+00    1.1285766929E+00    1.1101036434E+00
+    1.0919153412E+00    1.0740090637E+00    1.0563813566E+00    1.0390278824E+00
+    1.0219464881E+00    1.0051322370E+00    9.8858273586E-01    9.7229433753E-01
+    9.5626288454E-01    9.4048618717E-01    9.2495945205E-01    9.0968023104E-01
+    8.9464515722E-01    8.7984988668E-01    8.6529242529E-01    8.5096836827E-01
+    8.3687483992E-01    8.2300917666E-01    8.0936653060E-01    7.9594527075E-01
+    7.8274162686E-01    7.6975194584E-01    7.5697415488E-01    7.4440404132E-01
+    7.3203906033E-01    7.1987668706E-01    7.0791241321E-01    6.9614463944E-01
+    6.8457015681E-01    6.7318513862E-01    6.6198789975E-01    6.5097491802E-01
+    6.4014312669E-01    6.2949060514E-01    6.1901366286E-01    6.0870987684E-01
+    5.9857713213E-01    5.8861165082E-01    5.7881152804E-01    5.6917450923E-01
+    5.5969679766E-01    5.5037688431E-01    5.4121243428E-01    5.3219968832E-01
+    5.2333742019E-01    5.1462325731E-01    5.0605357159E-01    4.9762726943E-01
+    4.8934201433E-01    4.8119433956E-01    4.7318319951E-01    4.6530638099E-01
+    4.5756049431E-01    4.4994457445E-01    4.4245651371E-01    4.3509310405E-01
+    4.2785330791E-01    4.2073518522E-01    4.1373575057E-01    4.0685380826E-01
+    4.0008762412E-01    3.9343446965E-01    3.8689291717E-01    3.8046146824E-01
+    3.7413768570E-01    3.6791984300E-01    3.6180669955E-01    3.5579614205E-01
+    3.4988609008E-01    3.4407557281E-01    3.3836282768E-01    3.3274538488E-01
+    3.2722254123E-01    3.2179272713E-01    3.1645359987E-01    3.1120418021E-01
+    3.0604318897E-01    3.0096867591E-01    2.9597910956E-01    2.9107357482E-01
+    2.8625054370E-01    2.8150791508E-01    2.7684511501E-01    2.7226081425E-01
+    2.6775306980E-01    2.6332090390E-01    2.5896332932E-01    2.5467885571E-01
+    2.5046582167E-01    2.4632363995E-01    2.4225111660E-01    2.3824645700E-01
+    2.3430890125E-01    2.3043753190E-01    2.2663103075E-01    2.2288787866E-01
+    2.1920758650E-01    2.1558909889E-01    2.1203087347E-01    2.0853209698E-01
+    2.0509203703E-01    2.0170965015E-01    1.9838332245E-01    1.9511274762E-01
+    1.9189700637E-01    1.8873489405E-01    1.8562532983E-01    1.8256784638E-01
+    1.7956156317E-01    1.7660516721E-01    1.7369802148E-01    1.7083951993E-01
+    1.6802882312E-01    1.6526455844E-01    1.6254643724E-01    1.5987374760E-01
+    1.5724562162E-01    1.5466088112E-01    1.5211928277E-01    1.4962011532E-01
+    1.4716247731E-01    1.4474543496E-01    1.4236869347E-01    1.4003158209E-01
+    1.3773319599E-01    1.3547277982E-01    1.3325001085E-01    1.3106425897E-01
+    1.2891464189E-01    1.2680052175E-01    1.2472157135E-01    1.2267720085E-01
+    1.2066657246E-01    1.1868911133E-01    1.1674450739E-01    1.1483221045E-01
+    1.1295144579E-01    1.1110165375E-01    1.0928256040E-01    1.0749365423E-01
+    1.0573423961E-01    1.0400373057E-01    1.0230190533E-01    1.0062828986E-01
+    9.8982281391E-02    9.7363232570E-02    9.5770986299E-02    9.4205104640E-02
+    9.2665089253E-02    9.1150203181E-02    8.9660362635E-02    8.8195164240E-02
+    8.6754184314E-02    8.5336692055E-02    8.3942565008E-02    8.2571471377E-02
+    8.1223022254E-02    7.9896618189E-02    7.8591960933E-02    7.7308833712E-02
+    7.6046880544E-02    7.4805642086E-02    7.3584629176E-02    7.2383740506E-02
+    7.1202651131E-02    7.0041020775E-02    6.8898242541E-02    6.7774247481E-02
+    6.6668765461E-02    6.5581486791E-02    6.4511958509E-02    6.3459869402E-02
+    6.2425088961E-02    6.1407336085E-02    6.0406316994E-02    5.9421479885E-02
+    5.8452822608E-02    5.7500090972E-02    5.6563019041E-02    5.5641218020E-02
+    5.4734411156E-02    5.3842496378E-02    5.2965233586E-02    5.2102372240E-02
+    5.1253457601E-02    5.0418435755E-02    4.9597119611E-02    4.8789283548E-02
+    4.7994643509E-02    4.7212852717E-02    4.6443878327E-02    4.5687517686E-02
+    4.4943559586E-02    4.4211678082E-02    4.3491681375E-02    4.2783483184E-02
+    4.2086894267E-02    4.1401717619E-02    4.0727603994E-02    4.0064476856E-02
+    3.9412211446E-02    3.8770631668E-02    3.8139554396E-02    3.7518622579E-02
+    3.6907839114E-02    3.6307055480E-02    3.5716108368E-02    3.5134814385E-02
+    3.4562862225E-02    3.4000262270E-02    3.3446869273E-02    3.2902532276E-02
+    3.2367070616E-02    3.1840217984E-02    3.1321976903E-02    3.0812213425E-02
+    3.0310788441E-02    2.9817533657E-02    2.9332204905E-02    2.8854805910E-02
+    2.8385213523E-02    2.7923299942E-02    2.7468917225E-02    2.7021823958E-02
+    2.6582032748E-02    2.6149430718E-02    2.5723900801E-02    2.5305321709E-02
+    2.4893439431E-02    2.4488280997E-02    2.4089743440E-02    2.3697719832E-02
+    2.3312099450E-02    2.2932666819E-02    2.2559404233E-02    2.2192239239E-02
+    2.1831074449E-02    2.1475809062E-02    2.1126270265E-02    2.0782385275E-02
+    2.0444115841E-02    2.0111373508E-02    1.9784066758E-02    1.9462068697E-02
+    1.9145243997E-02    1.8833590598E-02    1.8527028380E-02    1.8225474479E-02
+    1.7928843269E-02    1.7636952905E-02    1.7349817563E-02    1.7067371509E-02
+    1.6789539913E-02    1.6516245476E-02    1.6247359808E-02    1.5982811518E-02
+    1.5722581767E-02    1.5466603166E-02    1.5214806114E-02    1.4967116953E-02
+    1.4723378333E-02    1.4483616032E-02    1.4247769523E-02    1.4015776305E-02
+    1.3787571887E-02    1.3563051106E-02    1.3342146415E-02    1.3124847757E-02
+    1.2911099151E-02    1.2700842848E-02    1.2494019314E-02    1.2290502671E-02
+    1.2090293497E-02    1.1893354059E-02    1.1699632778E-02    1.1509076483E-02
+    1.1321619331E-02    1.1137157247E-02    1.0955705606E-02    1.0777218459E-02
+    1.0601648447E-02    1.0428946789E-02    1.0259036651E-02    1.0091856576E-02
+    9.9274061406E-03    9.7656432701E-03    9.6065246281E-03    9.4500056081E-03
+    9.2960048318E-03    9.1444890831E-03    8.9954481332E-03    8.8488435842E-03
+    8.7046359123E-03    8.5627844608E-03    8.4232085626E-03    8.2858909371E-03
+    8.1508168888E-03    8.0179514903E-03    7.8872588105E-03    7.7587019092E-03
+    7.6322050317E-03    7.5077566051E-03    7.3853419925E-03    7.2649295244E-03
+    7.1464866384E-03    7.0299798746E-03    6.9153417240E-03    6.8025579795E-03
+    6.6916177587E-03    6.5824924357E-03    6.4751525918E-03    6.3695680107E-03
+    6.2656820661E-03    6.1634712357E-03    6.0629312937E-03    5.9640364449E-03
+    5.8667601907E-03    5.7710753252E-03    5.6769381537E-03    5.5843106621E-03
+    5.4931974191E-03    5.4035752507E-03    5.3154203599E-03    5.2287083230E-03
+    5.1434098717E-03    5.0594687788E-03    4.9768997803E-03    4.8956821190E-03
+    4.8157944869E-03    4.7372150220E-03    4.6599213051E-03    4.5838621307E-03
+    4.5090375337E-03    4.4354374536E-03    4.3630428668E-03    4.2918342606E-03
+    4.2217916304E-03    4.1528822007E-03    4.0850770685E-03    4.0183813155E-03
+    3.9527780072E-03    3.8882497776E-03    3.8247788275E-03    3.7623469216E-03
+    3.7009082742E-03    3.6404704082E-03    3.5810223207E-03    3.5225486012E-03
+    3.4650334570E-03    3.4084607124E-03    3.3528049178E-03    3.2980391006E-03
+    3.2441697846E-03    3.1911833332E-03    3.1390657742E-03    3.0878027987E-03
+    3.0373797587E-03    2.9877641034E-03    2.9389511875E-03    2.8909379581E-03
+    2.8437120887E-03    2.7972609573E-03    2.7515716446E-03    2.7066309326E-03
+    2.6624026263E-03    2.6188968441E-03    2.5761042605E-03    2.5340137731E-03
+    2.4926140192E-03    2.4518933742E-03    2.4118369488E-03    2.3724166399E-03
+    2.3336420418E-03    2.2955034166E-03    2.2579907985E-03    2.2210939931E-03
+    2.1848025759E-03    2.1491007974E-03    2.1139679998E-03    2.0794109580E-03
+    2.0454209566E-03    2.0119890803E-03    1.9791062132E-03    1.9467630375E-03
+    1.9149447857E-03    1.8836339902E-03    1.8528364927E-03    1.8225445192E-03
+    1.7927501202E-03    1.7634451708E-03    1.7346213692E-03    1.7062663468E-03
+    1.6783624620E-03    1.6509161204E-03    1.6239204100E-03    1.5973682657E-03
+    1.5712524689E-03    1.5455656462E-03
+ </PP_RHOATOM>
+</UPF>
diff --git a/tests/integrate/test.sum b/tests/integrate/test.sum
new file mode 100644
index 00000000000..8b137891791
--- /dev/null
+++ b/tests/integrate/test.sum
@@ -0,0 +1 @@
+
diff --git a/tests/integrate/tools/catch_properties.sh b/tests/integrate/tools/catch_properties.sh
index 859d35309fc..46d47fda74b 100755
--- a/tests/integrate/tools/catch_properties.sh
+++ b/tests/integrate/tools/catch_properties.sh
@@ -113,6 +113,7 @@ word_total_time="atomic_world"
 symmetry=$(get_input_key_value "symmetry" "INPUT")
 out_current=$(get_input_key_value "out_current" "INPUT")
 nspin=$(get_input_key_value "nspin" "INPUT")
+has_ds=$(get_input_key_value "sc_mag_switch" "INPUT")
 test -e $1 && rm $1
 
 #------------------------------------------------------------
@@ -157,7 +158,7 @@ fi
 # echo "has_stress:"$has_stress
 #-------------------------------
 if ! test -z "$has_stress" && [  $has_stress == 1 ]; then
-    grep -A6 "TOTAL-STRESS" $running_path| awk 'NF==3' | tail -3> stress.txt
+    grep -A6 "TOTAL-STRESS" $running_path| awk '/^[[:space:]]*-?[0-9]/' | tail -3> stress.txt
 	total_stress=`sum_file stress.txt`
 	rm stress.txt
 	echo "totalstressref $total_stress" >>$1
@@ -276,7 +277,7 @@ fi
 if ! test -z "$has_band"  && [  $has_band == 1 ]; then
 	bandref=band.txt.ref
 	bandcal=OUT.autotest/band.txt
-	python3 $COMPARE_SCRIPT $bandref $bandcal 8
+	python3 $COMPARE_SCRIPT $bandref $bandcal 5
 	echo "CompareBand_pass $?" >>$1
 fi
 
@@ -793,6 +794,64 @@ if ! test -z "$out_alllog" && [ $out_alllog -eq 1 ]; then
     fi
 fi
 
+#--------------------------------------------
+# DeltaSpin: atomic magnetic moments and lambda
+# Extract final after-optimization values from log
+#--------------------------------------------
+if ! test -z "$has_ds" && [ "$has_ds" == 1 ]; then
+    # Extract the last "after-optimization spin" block (final converged values)
+    # The block starts with "after-optimization spin" header and contains ATOM lines
+    # We need to find the last occurrence before "Inner optimization for lambda ends"
+    
+    # Get the line number of the last "after-optimization spin" header
+    last_spin_line=$(grep -n "after-optimization spin (uB)" "$running_path" | tail -1 | cut -d: -f1)
+    last_lambda_line=$(grep -n "after-optimization lambda (eV/uB)" "$running_path" | tail -1 | cut -d: -f1)
+    
+    if [ ! -z "$last_spin_line" ]; then
+        # Extract ATOM lines after the last "after-optimization spin" header
+        # Read until we hit a non-ATOM line (typically "Inner optimization")
+        spin_values=$(sed -n "$((last_spin_line + 1)),\$p" "$running_path" | awk '/^ATOM/{print; next} /^[^A]/{exit}')
+        
+        # Sum up x, y, z components for each atom and compute RMS deviation from target
+        if [ "$nspin" == 2 ]; then
+            # nspin=2: only z component
+            echo "$spin_values" | awk 'BEGIN{sum=0; n=0} /^ATOM/{sum+=$3*$3; n++} END{if(n>0) printf "%.10f\n", sqrt(sum/n)}' > magmom_rms.txt
+            magmom_rms=$(cat magmom_rms.txt)
+            if [ ! -z "$magmom_rms" ]; then
+                echo "ds_magmom_rmsref $magmom_rms" >>$1
+            fi
+            rm -f magmom_rms.txt
+        elif [ "$nspin" == 4 ]; then
+            # nspin=4: x, y, z components
+            echo "$spin_values" | awk 'BEGIN{sum=0; n=0} /^ATOM/{sum+=($3*$3+$4*$4+$5*$5); n++} END{if(n>0) printf "%.10f\n", sqrt(sum/n)}' > magmom_rms.txt
+            magmom_rms=$(cat magmom_rms.txt)
+            if [ ! -z "$magmom_rms" ]; then
+                echo "ds_magmom_rmsref $magmom_rms" >>$1
+            fi
+            rm -f magmom_rms.txt
+        fi
+        
+        # Extract individual atom magnetic moment magnitudes
+        echo "$spin_values" | awk '/^ATOM/{
+            if(NF>=5) {mag=sqrt($3*$3+$4*$4+$5*$5)}
+            else {mag=$3}
+            printf "ds_magmom_atom%dref %.10f\n", $2, mag
+        }' >>$1
+    fi
+    
+    if [ ! -z "$last_lambda_line" ]; then
+        # Extract ATOM lines after the last "after-optimization lambda" header
+        lambda_values=$(sed -n "$((last_lambda_line + 1)),\$p" "$running_path" | awk '/^ATOM/{print; next} /^[^A]/{exit}')
+        
+        # Extract individual atom lambda magnitudes
+        echo "$lambda_values" | awk '/^ATOM/{
+            if(NF>=5) {lam=sqrt($3*$3+$4*$4+$5*$5)}
+            else {lam=$3}
+            printf "ds_lambda_atom%dref %.10f\n", $2, lam
+        }' >>$1
+    fi
+fi
+
 #--------------------------------------------
 # Check time information 
 #--------------------------------------------