deepmodeling · dyzheng · May 1, 2026 · May 1, 2026 · May 1, 2026 · May 1, 2026
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -237,9 +237,16 @@ jobs:
         run: |
           ctest --test-dir build -V --timeout 1700 -R 10_others
 
+      - name: 17_DS_DFTU Test
+        env:
+          GTEST_COLOR: 'yes'
+          OMP_NUM_THREADS: '2'
+        run: |
+          ctest --test-dir build -V --timeout 1700 -R 17_DS_DFTU
+
       - name: Other Unittests
         env:
           GTEST_COLOR: 'yes'
           OMP_NUM_THREADS: '2'
         run: |
-          ctest --test-dir build -V --timeout 1700 -E 'integrate_test|01_PW|02_NAO_Gamma|03_NAO_multik|04_FF|05_rtTDDFT|06_SDFT|07_OFDFT|08_EXX|09_DeePKS|10_others|11_PW_GPU|12_NAO_Gamma_GPU|13_NAO_multik_GPU|15_rtTDDFT_GPU|16_SDFT_GPU|MODULE_BASE|MODULE_IO|MODULE_HSOLVER|MODULE_CELL|MODULE_MD|MODULE_PSI|MODULE_ESTATE|MODULE_RI|MODULE_HAMILT|MODULE_PW|MODULE_LCAO|MODULE_AO|MODULE_NAO|MODULE_RELAX|MODULE_LR'
+          ctest --test-dir build -V --timeout 1700 -E 'integrate_test|01_PW|02_NAO_Gamma|03_NAO_multik|04_FF|05_rtTDDFT|06_SDFT|07_OFDFT|08_EXX|09_DeePKS|10_others|11_PW_GPU|12_NAO_Gamma_GPU|13_NAO_multik_GPU|15_rtTDDFT_GPU|16_SDFT_GPU|17_DS_DFTU|MODULE_BASE|MODULE_IO|MODULE_HSOLVER|MODULE_CELL|MODULE_MD|MODULE_PSI|MODULE_ESTATE|MODULE_RI|MODULE_HAMILT|MODULE_PW|MODULE_LCAO|MODULE_AO|MODULE_NAO|MODULE_RELAX|MODULE_LR'
diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
@@ -394,6 +394,8 @@
     - [sccut](#sccut)
     - [sc\_drop\_thr](#sc_drop_thr)
     - [sc\_scf\_thr](#sc_scf_thr)
+    - [sc\_direction\_only](#sc_direction_only)
+    - [sc\_lambda\_strategy](#sc_lambda_strategy)
   - [vdW correction](#vdw-correction)
     - [vdw\_method](#vdw_method)
     - [vdw\_s6](#vdw_s6)
@@ -3481,8 +3483,8 @@
 
 - **Type**: Integer
 - **Description**: Determines whether to calculate the plus U correction, which is especially important for correlated electrons.
-  - 1: Calculate plus U correction with radius-adjustable localized projections (with parameter onsite_radius).
-  - 2: Calculate plus U correction using first zeta of NAOs as projections (this is old method for testing).
+  - 1: Calculate plus U correction with radius-adjustable localized projections (with parameter onsite_radius). Supported for both PW and LCAO basis sets.
+  - 2: Calculate plus U correction using first zeta of NAOs as projections (this is old method for testing). Only available for LCAO basis.
   - 0: Do not calculate plus U correction.
 - **Default**: 0
 
@@ -3629,6 +3631,24 @@
 - **Description**: Density error threshold for inner loop of spin-constrained SCF
 - **Default**: 1.0e-4
 
+### sc_direction_only
+
+- **Type**: Boolean
+- **Availability**: *sc_mag_switch is true*
+- **Description**: When true, only the direction of the magnetic moment is constrained to the target direction, while the magnitude is allowed to vary freely. This is useful for studying magnetic anisotropy or when the magnitude of the moment is determined by the electronic structure rather than an external constraint. When false (default), both the direction and magnitude of the magnetic moment are constrained to the target values.
+- **Default**: False
+
+### sc_lambda_strategy
+
+- **Type**: String
+- **Availability**: *sc_mag_switch is true*
+- **Description**: Lambda update strategy for spin-constrained DFT. Available options are:
+  - bfgs: BFGS quasi-Newton method (default, robust and well-tested)
+  - linear_response: linear response method (Scheme B)
+  - augmented_lagrangian: augmented Lagrangian method (Scheme C)
+  - hybrid_delayed: hybrid delayed update (Scheme D)
+- **Default**: bfgs
+
 [back to top](#full-list-of-input-keywords)
 
 ## vdW correction

diff --git a/docs/advanced/scf/construct_H.md b/docs/advanced/scf/construct_H.md
@@ -77,6 +77,6 @@ Here, we use a simple [example calculation](https://github.com/deepmodeling/abac
 
 Conventional functionals, e.g., L(S)DA and GGAs, encounter failures in strongly correlated systems, usually characterized by partially filled *d*/*f* shells. These include transition metals (TM) and their oxides, rare-earth compounds, and actinides, to name a few, where L(S)DA/GGAs typically yield quantitatively or even qualitatively wrong results. To address this failure, an efficient and successful method named DFT+*U*, which inherits the efficiency of L(S)DA/GGA but gains the strength of the Hubbard model in describing the physics of strongly correlated systems, has been developed.
 
-Now the DFT+*U* method is accessible in ABACUS. The details of the DFT+*U* method could be found in this [paper](https://doi.org/10.1063/5.0090122). It should be noted that the DFT+*U* works only within the NAO scheme, which means that the value of the keyword `basis_type` must be lcao when DFT+*U* is called. To turn on DFT+*U*, users need to set the value of the `dft_plus_u` keyword in the `INPUT` file to be 1. All relevant parmeters used in DFT+*U* calculations are listed in the [DFT+*U* correction](../input_files/input-main.md#dftu-correction) part of the [list of keywords](../input_files/input-main.md).
+Now the DFT+*U* method is accessible in ABACUS. The details of the DFT+*U* method can be found in this [paper](https://doi.org/10.1063/5.0090122). DFT+*U* is supported for both LCAO (`basis_type = lcao`) and plane-wave (`basis_type = pw`) basis sets. For the PW basis, `dft_plus_u = 1` (radius-adjustable localized projections) is supported with `nspin = 1`, `2`, or `4`. For the LCAO basis, both `dft_plus_u = 1` and `dft_plus_u = 2` are available. To turn on DFT+*U*, users need to set the value of the `dft_plus_u` keyword in the `INPUT` file to 1 (or, for the LCAO basis only, 2). All relevant parameters used in DFT+*U* calculations are listed in the [DFT+*U* correction](../input_files/input-main.md#dftu-correction) part of the [list of keywords](../input_files/input-main.md).
 
 Examples of DFT+*U* calculations are provided in this [directory](https://github.com/deepmodeling/abacus-develop/tree/develop/examples/dft_plus_u).
diff --git a/docs/advanced/scf/spin.md b/docs/advanced/scf/spin.md
@@ -28,6 +28,224 @@ If **"ocp=1"** and **"ocp_set"** is set in INPUT file, the occupations of states
 2. **"nupdown"**
 If **"nupdown"** is set to non-zero, number of spin-up and spin-down electrons will be fixed, and Fermi energy level will split to E_Fermi_up and E_Fermi_down. By the way, total magnetization will also be fixed, and will be the value of **"nupdown"**.
 
+## DeltaSpin (Spin-Constrained DFT)
+
+DeltaSpin is a spin-constrained DFT method that allows users to constrain the magnetic moments on individual atoms to target values during self-consistent field (SCF) calculations. This is useful for studying magnetic excitations, non-collinear magnetic structures, and systems where the magnetic ground state is not known a priori.
+
+The theoretical foundation and implementation details can be found in:
+
+- Cai Z, Wang K, Xu Y, et al., "A self-adaptive first-principles approach for magnetic excited states," *Quantum Frontiers* 2.1 (2023): 21. [DOI: 10.1007/s44214-023-00050-z](https://doi.org/10.1007/s44214-023-00050-z)
+- Zheng D, Peng X, Huang Y, et al., "Integrating deep-learning-based magnetic model and non-collinear spin-constrained method: methodology, implementation and application," *npj Computational Materials* (2026).
+
+### Enabling DeltaSpin
+
+Set `sc_mag_switch 1` in the INPUT file. DeltaSpin is supported for both PW (`basis_type = pw`) and LCAO (`basis_type = lcao`) basis sets, with `nspin = 2` (collinear) or `nspin = 4` (non-collinear).
+
+### Specifying Target Magnetic Moments in STRU
+
+Target magnetic moments and constraint flags are specified per atom in the `ATOMIC_POSITIONS` section of the STRU file, using the `mag` (or `magmom`), `sc`, `lambda`, `angle1`, and `angle2` keywords after the atomic coordinates.
+
+#### Collinear (nspin=2)
+
+For collinear spin, only the z-component of the magnetic moment is constrained:
+
+```
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00  0.00  0.00  mag  2.0   sc 1
+0.51  0.51  0.51  mag  -2.0  sc 1
+```
+
+- `mag 2.0`: target magnetic moment of 2.0 $\mu_B$ along z-axis
+- `sc 1`: constrain the z-component (1 = constrained, 0 = unconstrained)
+
+#### Non-collinear (nspin=4), vector form
+
+For non-collinear spin, specify the magnetic moment as a vector (mx, my, mz):
+
+```
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00  0.00  0.00  mag  2.0  0.0  0.0  sc 1 1 1
+0.51  0.51  0.51  mag  0.0  0.0  -2.0  sc 1 1 1
+```
+
+- `mag 2.0 0.0 0.0`: target moment vector in Cartesian coordinates ($\mu_B$)
+- `sc 1 1 1`: constrain x, y, z components respectively
+
+#### Non-collinear (nspin=4), angle form
+
+Alternatively, use `angle1` (polar angle $\theta$) and `angle2` (azimuthal angle $\phi$) in degrees to specify the direction:
+
+```
+0.00  0.00  0.00  mag 2.0  angle1 0  angle2 0    sc 1 1 1
+0.51  0.51  0.51  mag 2.0  angle1 180  angle2 0  sc 1 1 1
+```
+
+The Cartesian components are computed as:
+- $m_z = |\mathbf{m}| \cos\theta$
+- $m_x = |\mathbf{m}| \sin\theta \cos\phi$
+- $m_y = |\mathbf{m}| \sin\theta \sin\phi$
+
+#### Providing initial Lagrange multipliers
+
+Initial lambda values (in eV/$\mu_B$) can be provided via the `lambda` keyword to accelerate convergence:
+
+```
+0.00  0.00  0.00  mag 2.0  lambda 0.01 0.0 0.0  sc 1 1 1
+```
+
+A single value sets $\lambda_z$; three values set $\lambda_x$, $\lambda_y$, $\lambda_z$.
+
+#### Partial constraints
+
+Set the corresponding `sc` flag to 0 for each component that should remain unconstrained. For example, to constrain only the direction but not the magnitude (use with `sc_direction_only`):
+
+```
+0.00  0.00  0.00  mag 2.0  0.0  0.0  sc 1 1 0
+```
+
+### DeltaSpin INPUT Parameters
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `sc_mag_switch` | Boolean | False | Enable DeltaSpin |
+| `sc_thr` | Real | 1.0e-6 | Convergence criterion for lambda loop (RMS, in $\mu_B$) |
+| `nsc` | Integer | 100 | Maximum number of lambda iterations |
+| `nsc_min` | Integer | 2 | Minimum number of lambda iterations |
+| `sc_scf_nmin` | Integer | 2 | Minimum outer SCF iterations before starting lambda loop |
+| `alpha_trial` | Real | 0.01 | Initial trial step size for lambda (eV/$\mu_B^2$) |
+| `sccut` | Real | 3.0 | Maximum step size for lambda (eV/$\mu_B$) |
+| `sc_drop_thr` | Real | 1.0e-2 | Convergence ratio threshold for adaptive lambda loop |
+| `sc_scf_thr` | Real | 1.0e-4 | Density error threshold for entering lambda loop |
+| `sc_direction_only` | Boolean | False | Constrain only the direction, not the magnitude |
+| `sc_lambda_strategy` | String | bfgs | Lambda update strategy (see below) |
+| `decay_grad_switch` | Boolean | False | Enable gradient-based early exit |
+
+For full parameter details, see the [Spin-Constrained DFT](../input_files/input-main.md#spin-constrained-dft) section of the input keyword list.
+
+### Lambda Update Strategies
+
+The `sc_lambda_strategy` parameter controls how the Lagrange multipliers $\lambda$ are updated during the lambda loop:
+
+- **`bfgs`** (default): BFGS quasi-Newton method with line search. Robust and well-tested for both PW and LCAO. Uses `alpha_trial` and `sccut` to control step size.
+
+- **`linear_response`**: Linear response method (Scheme B). Estimates the magnetic susceptibility $\chi$ from the history of $(\lambda, M)$ pairs and performs a one-step Newton-like update: $\Delta\lambda = \beta (M_{\text{target}} - M) / \chi$, where $\beta$ is a mixing parameter.
+
+- **`augmented_lagrangian`**: Augmented Lagrangian method (Scheme C). Uses a penalty parameter $\mu$ that grows over iterations: $\lambda_{\text{new}} = \lambda + \mu (M - M_{\text{target}})$. The penalty increases until convergence is achieved.
+
+- **`hybrid_delayed`**: Hybrid delayed update (Scheme D). Two-phase approach: in the early phase (SCF not yet converged), lambda updates are gentle; in the late phase (SCF nearly converged), augmented Lagrangian updates are applied.
+
+### Direction-Only Mode
+
+When `sc_direction_only 1` is set, only the **direction** of the magnetic moment is constrained to match the target, while the magnitude is allowed to vary freely. This is useful for:
+
+- Studying magnetic anisotropy energy surfaces
+- Cases where the moment magnitude is determined by the electronic structure
+- Converging to the easy-axis direction without fixing the moment size
+
+In this mode, the lambda vector is projected to be perpendicular to the target moment direction at each iteration, ensuring it can only rotate the magnetization, not stretch it.
+
+### Combining DeltaSpin with DFT+U
+
+DeltaSpin can be combined with DFT+U for strongly correlated systems. When both `sc_mag_switch` and `dft_plus_u` are enabled:
+
+1. DFT+U occupation update runs first in each SCF iteration
+2. DeltaSpin lambda loop runs after, constraining the magnetic moments
+3. The DFT+U-corrected Hamiltonian is used by the lambda loop
+
+Example INPUT for PW DFT+U + DeltaSpin:
+
+```
+INPUT_PARAMETERS
+calculation         scf
+basis_type          pw
+ecutwfc             50
+nspin               2
+dft_plus_u          1
+orbital_corr        -1 2
+hubbard_u           0.0 4.0
+sc_mag_switch       1
+sc_thr              1.0e-6
+sc_scf_thr          1.0e-4
+sc_lambda_strategy  bfgs
+```
+
+### Example: Collinear antiferromagnetic Fe
+
+INPUT file:
+
+```
+INPUT_PARAMETERS
+calculation         scf
+basis_type          pw
+ecutwfc             50
+nspin               2
+sc_mag_switch       1
+sc_thr              1.0e-6
+```
+
+STRU file:
+
+```
+ATOMIC_SPECIES
+Fe 55.845 Fe.upf
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00  0.50  0.50
+ 0.50  1.00  0.50
+ 0.50  0.50  1.00
+
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00  0.00  0.00  mag  2.0  sc 1
+0.51  0.51  0.51  mag  -2.0  sc 1
+```
+
+### Example: Non-collinear constrained moments
+
+INPUT file:
+
+```
+INPUT_PARAMETERS
+calculation         scf
+basis_type          pw
+ecutwfc             50
+nspin               4
+noncolin            1
+sc_mag_switch       1
+sc_direction_only   1
+sc_lambda_strategy  bfgs
+```
+
+STRU file:
+
+```
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00  0.00  0.00  mag  2.0  0.0  0.0  sc 1 1 0
+0.51  0.51  0.51  mag  0.0  0.0  2.0  sc 1 1 0
+```
+
 ## Noncollinear Spin Polarized Calculations
 The spin non-collinear polarization calculation corresponds to setting **"noncolin 1"**, in which case the coupling between spin up and spin down will be taken into account.
 In this case, nspin is automatically set to 4, which is usually not required to be specified manually.

diff --git a/docs/parameters.yaml b/docs/parameters.yaml
@@ -4266,6 +4266,26 @@ parameters:
     default_value: "1.0e-4"
     unit: ""
     availability: sc_mag_switch is true
+  - name: sc_direction_only
+    category: Spin-Constrained DFT
+    type: Boolean
+    description: |
+      When true, only the direction of the magnetic moment is constrained to the target direction, while the magnitude is allowed to vary freely. This is useful for studying magnetic anisotropy or when the magnitude of the moment is determined by the electronic structure rather than an external constraint. When false (default), both the direction and magnitude of the magnetic moment are constrained to the target values.
+    default_value: "False"
+    unit: ""
+    availability: sc_mag_switch is true
+  - name: sc_lambda_strategy
+    category: Spin-Constrained DFT
+    type: String
+    description: |
+      Lambda update strategy for spin-constrained DFT. Available options are:
+      * bfgs: BFGS quasi-Newton method (default, robust and well-tested)
+      * linear_response: linear response method (Scheme B)
+      * augmented_lagrangian: augmented Lagrangian method (Scheme C)
+      * hybrid_delayed: hybrid delayed update (Scheme D)
+    default_value: "bfgs"
+    unit: ""
+    availability: sc_mag_switch is true
   - name: qo_switch
     category: Quasiatomic Orbital (QO) analysis
     type: Boolean

diff --git a/interfaces/Wannier90_interface/examples_python/example_pw.py b/interfaces/Wannier90_interface/examples_python/example_pw.py
@@ -64,7 +64,7 @@ def main():
     # 3. Dependency files (PW only needs pseudopotentials)
     # ----------------------------------------------------------
     job.pp_orbitals = {"Bi": "../../../tests/PP_ORB/Bi_pbe_fr.upf", "Se": "../../../tests/PP_ORB/Se_pbe_fr.upf"}
-    # ← PW基组不需要轨道文件，不设置 orbital_files
+    # PW basis does not require orbital files, orbital_files not set
 
     # ----------------------------------------------------------
     # 4. Wannier90 Parameters
@@ -99,7 +99,7 @@ def main():
         if DRY_RUN:
             job._validate_inputs()
 
-            # Step 0: SCF (PW基组)
+            # Step 0: SCF (PW basis)
             job.step0_run_scf(
                 scf_mp_grid=[4, 4, 4],
             )

diff --git a/source/source_base/kernels/cuda/math_kernel_op.cu b/source/source_base/kernels/cuda/math_kernel_op.cu
@@ -314,6 +314,9 @@ void gemm_op<std::complex<double>, base_device::DEVICE_GPU>::operator()(const ch
 {
     cublasOperation_t cutransA = judge_trans_op(true, transa, "gemm_op");
     cublasOperation_t cutransB = judge_trans_op(true, transb, "gemm_op");
+    if (cublas_handle == nullptr) {
+        CHECK_CUBLAS(cublasCreate(&cublas_handle));
+    }
     CHECK_CUBLAS(cublasZgemm(cublas_handle, cutransA, cutransB, m, n ,k, (double2*)alpha, (double2*)a , lda, (double2*)b, ldb, (double2*)beta, (double2*)c, ldc));
 }
 

diff --git a/source/source_base/main.cpp b/source/source_base/main.cpp
@@ -36,7 +36,7 @@ void calculate()
 /*
 	time_t time_start = std::time(NULL);
 
-//	ModuleBase::timer::start();
+//	ModuleBase::timer::tick();
 
 	//----------------------------------------------------------
 	// main program for doing electronic structure calculations

diff --git a/source/source_base/module_container/base/macros/cuda.h b/source/source_base/module_container/base/macros/cuda.h
@@ -67,11 +67,13 @@ struct GetTypeCuda<double>
 {
     static constexpr cudaDataType cuda_data_type = cudaDataType::CUDA_R_64F;
 };
+#if CUDA_VERSION >= 11000 // NOTE(review): presumably CUDA_R_64I is not declared by pre-11.0 toolkits -- confirm
 template <>
 struct GetTypeCuda<int64_t>
 {
     static constexpr cudaDataType cuda_data_type = cudaDataType::CUDA_R_64I;
 };
+#endif // CUDA_VERSION >= 11000
 template <>
 struct GetTypeCuda<std::complex<float>>
 {