diff --git a/source/source_esolver/esolver_ks_lcao_tddft.cpp b/source/source_esolver/esolver_ks_lcao_tddft.cpp
index 361e14caad5..130fc94139f 100644
--- a/source/source_esolver/esolver_ks_lcao_tddft.cpp
+++ b/source/source_esolver/esolver_ks_lcao_tddft.cpp
@@ -54,6 +54,12 @@ ESolver_KS_LCAO_TDDFT<TR, Device>::~ESolver_KS_LCAO_TDDFT()
         delete td_p;
     }
     TD_info::td_vel_op = nullptr;
+
+    if (td_mg_ != nullptr)
+    {
+        delete td_mg_;
+        td_mg_ = nullptr;
+    }
 }
 
 template <typename TR, typename Device>
@@ -94,6 +100,16 @@ void ESolver_KS_LCAO_TDDFT<TR, Device>::runner(UnitCell& ucell, const int istep)
     // 1) before_scf (electronic iteration loops)
     //----------------------------------------------------------------
     this->before_scf(ucell, istep); // From ESolver_KS_LCAO
+
+    // Initialize the moving spatial gauge
+    if (use_td_moving_gauge && this->td_mg_ == nullptr)
+    {
+        this->td_mg_ = new module_rt::TD_MovingGauge();
+        auto* hamilt_lcao = dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, TR>*>(this->p_hamilt);
+        const hamilt::HContainer<TR>* sR_template = hamilt_lcao->getSR();
+        this->td_mg_->init_DR(sR_template, &ucell, &this->pv, this->two_center_bundle_.overlap_orb.get());
+    }
+
     if (PARAM.inp.td_stype == 2)
     {
         this->dmat.dm->cal_DMR_td(ucell, TD_info::cart_At);
@@ -242,6 +258,14 @@ void ESolver_KS_LCAO_TDDFT<TR, Device>::hamilt2rho_single(UnitCell& ucell,
                                                           const int iter,
                                                           const double ethr)
 {
+    // Update the moving spatial gauge
+    if (use_td_moving_gauge)
+    {
+        auto* hamilt_lcao = dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, TR>*>(this->p_hamilt);
+        const hamilt::HContainer<TR>* sR_template = hamilt_lcao->getSR();
+        this->td_mg_->update_DR(sR_template, &ucell, &this->pv, this->two_center_bundle_.overlap_orb.get());
+    }
+
     if (PARAM.inp.init_wfc == "file")
     {
         if (istep >= TD_info::estep_shift + 1)
@@ -261,7 +285,11 @@ void ESolver_KS_LCAO_TDDFT<TR, Device>::hamilt2rho_single(UnitCell& ucell,
                 GlobalV::ofs_running,
                 PARAM.inp.propagator,
                 use_tensor,
-                use_lapack);
+                use_lapack,
+                this->td_mg_,
+                &ucell,
+                this->kv.kvec_d,
+                use_td_moving_gauge);
         }
         this->weight_dm_rho(ucell);
     }
@@ -281,7 +309,11 @@ void ESolver_KS_LCAO_TDDFT<TR, Device>::hamilt2rho_single(UnitCell& ucell,
                                                   GlobalV::ofs_running,
                                                   PARAM.inp.propagator,
                                                   use_tensor,
-                                                  use_lapack);
+                                                  use_lapack,
+                                                  this->td_mg_,
+                                                  &ucell,
+                                                  this->kv.kvec_d,
+                                                  use_td_moving_gauge);
         this->weight_dm_rho(ucell);
     }
     else
diff --git a/source/source_esolver/esolver_ks_lcao_tddft.h b/source/source_esolver/esolver_ks_lcao_tddft.h
index f534b303f44..b4227a9ab7d 100644
--- a/source/source_esolver/esolver_ks_lcao_tddft.h
+++ b/source/source_esolver/esolver_ks_lcao_tddft.h
@@ -7,6 +7,7 @@
 #include "source_lcao/module_rt/gather_mat.h" // MPI gathering and distributing functions
 #include "source_lcao/module_rt/kernels/cublasmp_context.h"
 #include "source_lcao/module_rt/td_info.h"
+#include "source_lcao/module_rt/td_moving_gauge.h"
 #include "source_lcao/module_rt/velocity_op.h"
 
 namespace ModuleESolver
@@ -66,6 +67,10 @@ class ESolver_KS_LCAO_TDDFT : public ESolver_KS_LCAO<std::complex<double>, TR>
 
     TD_info* td_p = nullptr;
 
+    //! Moving spatial gauge for Ehrenfest dynamics, to calculate the correction term arising from the movement of basis
+    bool use_td_moving_gauge = false;
+    module_rt::TD_MovingGauge* td_mg_ = nullptr;
+
     //! Restart flag
     bool restart_done = false;
 
diff --git a/source/source_lcao/module_rt/CMakeLists.txt b/source/source_lcao/module_rt/CMakeLists.txt
index a1056c33474..046632f3138 100644
--- a/source/source_lcao/module_rt/CMakeLists.txt
+++ b/source/source_lcao/module_rt/CMakeLists.txt
@@ -16,6 +16,7 @@ if(ENABLE_LCAO)
         td_folding.cpp
         solve_propagation.cpp
         boundary_fix.cpp
+        td_moving_gauge.cpp
     )
 
     if(USE_CUDA)
diff --git a/source/source_lcao/module_rt/evolve_elec.cpp b/source/source_lcao/module_rt/evolve_elec.cpp
index 54e234e2a56..ede4365258e 100644
--- a/source/source_lcao/module_rt/evolve_elec.cpp
+++ b/source/source_lcao/module_rt/evolve_elec.cpp
@@ -10,9 +10,9 @@
 namespace module_rt
 {
 template <typename Device>
-Evolve_elec<Device>::Evolve_elec(){};
+Evolve_elec<Device>::Evolve_elec() {};
 template <typename Device>
-Evolve_elec<Device>::~Evolve_elec(){};
+Evolve_elec<Device>::~Evolve_elec() {};
 
 template <typename Device>
 ct::DeviceType Evolve_elec<Device>::ct_device_type = ct::DeviceTypeToEnum<Device>::value;
@@ -33,7 +33,11 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
                                     std::ofstream& ofs_running,
                                     const int propagator,
                                     const bool use_tensor,
-                                    const bool use_lapack)
+                                    const bool use_lapack,
+                                    module_rt::TD_MovingGauge* td_mg,
+                                    const UnitCell* ucell,
+                                    const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                                    const bool use_td_moving_gauge)
 {
     ModuleBase::TITLE("Evolve_elec", "solve_psi");
     ModuleBase::timer::start("Evolve_elec", "solve_psi");
@@ -57,6 +61,13 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
 
         if (!use_tensor)
         {
+            // Construct the local P_k matrix for moving spatial gauge, CPU only for now
+            std::vector<std::complex<double>> P_k_local(para_orb.nloc, {0.0, 0.0});
+            if (use_td_moving_gauge && td_mg != nullptr)
+            {
+                td_mg->get_P_k(ucell, kvec_d[ik], P_k_local.data(), para_orb.nloc, para_orb.ncol);
+            }
+
             const int len_HS_laststep = use_lapack ? nlocal * nlocal : para_orb.nloc;
             evolve_psi(nband,
                        nlocal,
@@ -66,6 +77,8 @@ void Evolve_elec<Device>::solve_psi(const int& istep,
                        psi_laststep[0].get_pointer(),
                        Hk_laststep.data<std::complex<double>>() + ik * len_HS_laststep,
                        Sk_laststep.data<std::complex<double>>() + ik * len_HS_laststep,
+                       P_k_local.data(),
+                       use_td_moving_gauge,
                        &(ekb(ik, 0)),
                        propagator,
                        ofs_running,
diff --git a/source/source_lcao/module_rt/evolve_elec.h b/source/source_lcao/module_rt/evolve_elec.h
index 3c2aa95cf6d..cabd57e947d 100644
--- a/source/source_lcao/module_rt/evolve_elec.h
+++ b/source/source_lcao/module_rt/evolve_elec.h
@@ -13,6 +13,7 @@
 #include "source_lcao/hamilt_lcao.h"
 #include "source_lcao/module_rt/gather_mat.h" // MPI gathering and distributing functions
 #include "source_lcao/module_rt/kernels/cublasmp_context.h"
+#include "source_lcao/module_rt/td_moving_gauge.h"
 #include "source_psi/psi.h"
 
 //-----------------------------------------------------------
@@ -158,7 +159,11 @@ class Evolve_elec
                           std::ofstream& ofs_running,
                           const int propagator,
                           const bool use_tensor,
-                          const bool use_lapack);
+                          const bool use_lapack,
+                          module_rt::TD_MovingGauge* td_mg,
+                          const UnitCell* ucell,
+                          const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                          const bool use_td_moving_gauge);
 
     // ct_device_type = ct::DeviceType::CpuDevice or ct::DeviceType::GpuDevice
     static ct::DeviceType ct_device_type;
diff --git a/source/source_lcao/module_rt/evolve_psi.cpp b/source/source_lcao/module_rt/evolve_psi.cpp
index ea3b40f293d..5f3b1556057 100644
--- a/source/source_lcao/module_rt/evolve_psi.cpp
+++ b/source/source_lcao/module_rt/evolve_psi.cpp
@@ -24,6 +24,8 @@ void evolve_psi(const int nband,
                 std::complex<double>* psi_k_laststep,
                 std::complex<double>* H_laststep,
                 std::complex<double>* S_laststep,
+                std::complex<double>* P_k,
+                const bool use_td_moving_gauge,
                 double* ekb,
                 int propagator,
                 std::ofstream& ofs_running,
@@ -85,8 +87,15 @@ void evolve_psi(const int nband,
     {
         /// @brief solve the propagation equation
         /// @input Stmp, Htmp, psi_k_laststep
-        /// @output psi_k
-        solve_propagation(pv, nband, nlocal, PARAM.inp.td_dt, Stmp, Htmp, psi_k_laststep, psi_k);
+        /// @output psi_k        
+        if (use_td_moving_gauge)
+        {
+            solve_propagation(pv, nband, nlocal, PARAM.inp.td_dt, Stmp, Htmp, P_k, psi_k_laststep, psi_k);
+        }
+        else
+        {
+            solve_propagation(pv, nband, nlocal, PARAM.inp.td_dt, Stmp, Htmp, psi_k_laststep, psi_k);
+        }
     }
 
     // (4)->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
diff --git a/source/source_lcao/module_rt/evolve_psi.h b/source/source_lcao/module_rt/evolve_psi.h
index 34a29d68812..287b47e10c0 100644
--- a/source/source_lcao/module_rt/evolve_psi.h
+++ b/source/source_lcao/module_rt/evolve_psi.h
@@ -23,6 +23,8 @@ void evolve_psi(const int nband,
                 std::complex<double>* psi_k_laststep,
                 std::complex<double>* H_laststep,
                 std::complex<double>* S_laststep,
+                std::complex<double>* P_k,
+                const bool use_td_moving_gauge,
                 double* ekb,
                 int propagator,
                 std::ofstream& ofs_running,
diff --git a/source/source_lcao/module_rt/solve_propagation.cpp b/source/source_lcao/module_rt/solve_propagation.cpp
index 298bf2eef94..f17837bf9a7 100644
--- a/source/source_lcao/module_rt/solve_propagation.cpp
+++ b/source/source_lcao/module_rt/solve_propagation.cpp
@@ -1,8 +1,9 @@
 #include "solve_propagation.h"
-#include "source_base/module_external/scalapack_connector.h"
-#include "source_base/module_external/blas_connector.h"
+
 #include "source_base/constants.h"
 #include "source_base/global_function.h"
+#include "source_base/module_external/blas_connector.h"
+#include "source_base/module_external/scalapack_connector.h"
 
 #include <iostream>
 
@@ -10,13 +11,13 @@ namespace module_rt
 {
 #ifdef __MPI
 void solve_propagation(const Parallel_Orbitals* pv,
-                        const int nband,
-                        const int nlocal,
-                        const double dt,
-                        const std::complex<double>* Stmp,
-                        const std::complex<double>* Htmp,
-                        const std::complex<double>* psi_k_laststep,
-                        std::complex<double>* psi_k)
+                       const int nband,
+                       const int nlocal,
+                       const double dt,
+                       const std::complex<double>* Stmp,
+                       const std::complex<double>* Htmp,
+                       const std::complex<double>* psi_k_laststep,
+                       std::complex<double>* psi_k)
 {
     // (1) init A,B and copy Htmp to A & B
     std::complex<double>* operator_A = new std::complex<double>[pv->nloc];
@@ -28,7 +29,7 @@ void solve_propagation(const Parallel_Orbitals* pv,
     BlasConnector::copy(pv->nloc, Htmp, 1, operator_B, 1);
 
     const double dt_au = dt / ModuleBase::AU_to_FS;
-    
+
     // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
     // (2) compute operator_A & operator_B by GEADD
     // operator_A = Stmp + i*para * Htmp;   beta2 = para = 0.25 * dt
@@ -37,78 +38,118 @@ void solve_propagation(const Parallel_Orbitals* pv,
     std::complex<double> beta1 = {0.0, -0.25 * dt_au};
     std::complex<double> beta2 = {0.0, 0.25 * dt_au};
 
-    ScalapackConnector::geadd('N',
-                              nlocal,
-                              nlocal,
-                              alpha,
-                              Stmp,
-                              1,
-                              1,
-                              pv->desc,
-                              beta2,
-                              operator_A,
-                              1,
-                              1,
-                              pv->desc);
-    ScalapackConnector::geadd('N',
-                              nlocal,
-                              nlocal,
-                              alpha,
-                              Stmp,
-                              1,
-                              1,
-                              pv->desc,
-                              beta1,
-                              operator_B,
-                              1,
-                              1,
-                              pv->desc);
+    ScalapackConnector::geadd('N', nlocal, nlocal, alpha, Stmp, 1, 1, pv->desc, beta2, operator_A, 1, 1, pv->desc);
+    ScalapackConnector::geadd('N', nlocal, nlocal, alpha, Stmp, 1, 1, pv->desc, beta1, operator_B, 1, 1, pv->desc);
+    // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+    // (3) b = operator_B @ psi_k_laststep
+    std::complex<double>* tmp_b = new std::complex<double>[pv->nloc_wfc];
+    ScalapackConnector::gemm('N',
+                             'N',
+                             nlocal,
+                             nband,
+                             nlocal,
+                             1.0,
+                             operator_B,
+                             1,
+                             1,
+                             pv->desc,
+                             psi_k_laststep,
+                             1,
+                             1,
+                             pv->desc_wfc,
+                             0.0,
+                             tmp_b,
+                             1,
+                             1,
+                             pv->desc_wfc);
+    // get ipiv
+    int* ipiv = new int[pv->nloc];
+    int info = 0;
+    // (4) solve Ac=b
+    ScalapackConnector::gesv(nlocal, nband, operator_A, 1, 1, pv->desc, ipiv, tmp_b, 1, 1, pv->desc_wfc, &info);
+
+    // copy solution to psi_k
+    BlasConnector::copy(pv->nloc_wfc, tmp_b, 1, psi_k, 1);
+
+    delete[] tmp_b;
+    delete[] ipiv;
+    delete[] operator_A;
+    delete[] operator_B;
+}
+
+void solve_propagation(const Parallel_Orbitals* pv,
+                       const int nband,
+                       const int nlocal,
+                       const double dt,
+                       const std::complex<double>* Stmp,
+                       const std::complex<double>* Htmp,
+                       const std::complex<double>* P_k, // <--- 接收 P_k
+                       const std::complex<double>* psi_k_laststep,
+                       std::complex<double>* psi_k)
+{
+    // Print message for debugging, should be removed later
+    std::cout << "Entering solve_propagation with moving gauge P_k..." << std::endl;
+    // (1) init A, B and compute HPtmp = Htmp + P_k
+    std::complex<double>* operator_A = new std::complex<double>[pv->nloc];
+    std::complex<double>* operator_B = new std::complex<double>[pv->nloc];
+
+    // Add up Htmp and P_k to get the effective Hamiltonian matrix for moving spatial gauge
+    for (int i = 0; i < pv->nloc; ++i)
+    {
+        operator_A[i] = Htmp[i] + P_k[i];
+        operator_B[i] = Htmp[i] + P_k[i];
+    }
+
+    const double dt_au = dt / ModuleBase::AU_to_FS;
+
+    // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+    // (2) compute operator_A & operator_B by GEADD
+    // operator_A = Stmp + i*para * (Htmp + P_k);
+    // operator_B = Stmp - i*para * (Htmp + P_k);
+    std::complex<double> alpha = {1.0, 0.0};
+    std::complex<double> beta1 = {0.0, -0.25 * dt_au};
+    std::complex<double> beta2 = {0.0, 0.25 * dt_au};
+
+    ScalapackConnector::geadd('N', nlocal, nlocal, alpha, Stmp, 1, 1, pv->desc, beta2, operator_A, 1, 1, pv->desc);
+    ScalapackConnector::geadd('N', nlocal, nlocal, alpha, Stmp, 1, 1, pv->desc, beta1, operator_B, 1, 1, pv->desc);
+
     // ->>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
     // (3) b = operator_B @ psi_k_laststep
     std::complex<double>* tmp_b = new std::complex<double>[pv->nloc_wfc];
     ScalapackConnector::gemm('N',
-                        'N',
-                        nlocal,
-                        nband,
-                        nlocal,
-                        1.0,
-                        operator_B,
-                        1,
-                        1,
-                        pv->desc,
-                        psi_k_laststep,
-                        1,
-                        1,
-                        pv->desc_wfc,
-                        0.0,
-                        tmp_b,
-                        1,
-                        1,
-                        pv->desc_wfc);
-    //get ipiv
+                             'N',
+                             nlocal,
+                             nband,
+                             nlocal,
+                             1.0,
+                             operator_B,
+                             1,
+                             1,
+                             pv->desc,
+                             psi_k_laststep,
+                             1,
+                             1,
+                             pv->desc_wfc,
+                             0.0,
+                             tmp_b,
+                             1,
+                             1,
+                             pv->desc_wfc);
+
+    // get ipiv
     int* ipiv = new int[pv->nloc];
     int info = 0;
+
     // (4) solve Ac=b
-    ScalapackConnector::gesv(nlocal,
-                            nband,
-                            operator_A,
-                            1,
-                            1,
-                            pv->desc,
-                            ipiv,
-                            tmp_b,
-                            1,
-                            1,
-                            pv->desc_wfc,
-                            &info);
-
-    //copy solution to psi_k
+    ScalapackConnector::gesv(nlocal, nband, operator_A, 1, 1, pv->desc, ipiv, tmp_b, 1, 1, pv->desc_wfc, &info);
+
+    // copy solution to psi_k
     BlasConnector::copy(pv->nloc_wfc, tmp_b, 1, psi_k, 1);
 
-    delete []tmp_b;
-    delete []ipiv;
-    delete []operator_A;
-    delete []operator_B;
+    delete[] tmp_b;
+    delete[] ipiv;
+    delete[] operator_A;
+    delete[] operator_B;
 }
 #endif // __MPI
 } // namespace module_rt
diff --git a/source/source_lcao/module_rt/solve_propagation.h b/source/source_lcao/module_rt/solve_propagation.h
index 309c552570d..23a38ac25c0 100644
--- a/source/source_lcao/module_rt/solve_propagation.h
+++ b/source/source_lcao/module_rt/solve_propagation.h
@@ -2,32 +2,47 @@
 #define TD_SOLVE_PROPAGATION_H
 
 #include "source_basis/module_ao/parallel_orbitals.h"
+
 #include <complex>
 
 namespace module_rt
 {
 #ifdef __MPI
 /**
-*  @brief solve propagation equation A@c(t+dt) = B@c(t)
-*
-* @param[in] pv information of parallel
-* @param[in] nband number of bands
-* @param[in] nlocal number of orbitals
-* @param[in] dt time interval
-* @param[in] Stmp overlap matrix S(t+dt/2)
-* @param[in] Htmp H(t+dt/2)
-* @param[in] psi_k_laststep psi of last step
-* @param[out] psi_k psi of this step
-*/
+ *  @brief solve propagation equation A@c(t+dt) = B@c(t)
+ *
+ * @param[in] pv information of parallel
+ * @param[in] nband number of bands
+ * @param[in] nlocal number of orbitals
+ * @param[in] dt time interval
+ * @param[in] Stmp overlap matrix S(t+dt/2)
+ * @param[in] Htmp H(t+dt/2)
+ * @param[in] psi_k_laststep psi of last step
+ * @param[out] psi_k psi of this step
+ */
 void solve_propagation(const Parallel_Orbitals* pv,
-                        const int nband,
-                        const int nlocal,
-                        const double dt,
-                        const std::complex<double>* Stmp,
-                        const std::complex<double>* Htmp,
-                        const std::complex<double>* psi_k_laststep,
-                        std::complex<double>* psi_k);
+                       const int nband,
+                       const int nlocal,
+                       const double dt,
+                       const std::complex<double>* Stmp,
+                       const std::complex<double>* Htmp,
+                       const std::complex<double>* psi_k_laststep,
+                       std::complex<double>* psi_k);
 
+/**
+ * @brief solve propagation equation A@c(t+dt) = B@c(t) with moving spatial gauge P_k
+ *
+ * @param[in] P_k moving spatial gauge matrix
+ */
+void solve_propagation(const Parallel_Orbitals* pv,
+                       const int nband,
+                       const int nlocal,
+                       const double dt,
+                       const std::complex<double>* Stmp,
+                       const std::complex<double>* Htmp,
+                       const std::complex<double>* P_k,
+                       const std::complex<double>* psi_k_laststep,
+                       std::complex<double>* psi_k);
 #endif
 } // namespace module_rt
 
diff --git a/source/source_lcao/module_rt/td_moving_gauge.cpp b/source/source_lcao/module_rt/td_moving_gauge.cpp
new file mode 100644
index 00000000000..80b7915019a
--- /dev/null
+++ b/source/source_lcao/module_rt/td_moving_gauge.cpp
@@ -0,0 +1,308 @@
+#include "td_moving_gauge.h"
+
+#include "source_base/global_function.h"
+#include "source_base/libm/libm.h" // sincos
+
+namespace module_rt
+{
+
+TD_MovingGauge::~TD_MovingGauge()
+{
+    for (int i = 0; i < nat_; ++i)
+    {
+        delete DR_x_[i];
+        delete DR_y_[i];
+        delete DR_z_[i];
+    }
+}
+
+template <typename T_sR>
+void TD_MovingGauge::init_DR(const hamilt::HContainer<T_sR>* sR_template,
+                             const UnitCell* ucell,
+                             const Parallel_Orbitals* paraV,
+                             TwoCenterIntegrator* intor)
+{
+    nat_ = ucell->nat;
+    DR_x_.resize(nat_, nullptr);
+    DR_y_.resize(nat_, nullptr);
+    DR_z_.resize(nat_, nullptr);
+
+    // 1. Allocate an HContainer for each atom
+    for (int K = 0; K < nat_; ++K)
+    {
+        DR_x_[K] = new hamilt::HContainer<double>(paraV);
+        DR_y_[K] = new hamilt::HContainer<double>(paraV);
+        DR_z_[K] = new hamilt::HContainer<double>(paraV);
+    }
+
+    // 2. Construct the sparsity pattern based on sR_template, only allocate terms where delta_{JK} is non-zero
+    for (int iap = 0; iap < sR_template->size_atom_pairs(); ++iap)
+    {
+        const auto& ap = sR_template->get_atom_pair(iap);
+        int iat1 = ap.get_atom_i();
+        int iat2 = ap.get_atom_j(); // target ket atom J
+
+        hamilt::AtomPair<double> ap_x(iat1, iat2, paraV);
+        hamilt::AtomPair<double> ap_y(iat1, iat2, paraV);
+        hamilt::AtomPair<double> ap_z(iat1, iat2, paraV);
+
+        for (int ir = 0; ir < ap.get_R_size(); ++ir)
+        {
+            auto R_idx = ap.get_R_index(ir);
+            ap_x.get_HR_values(R_idx.x, R_idx.y, R_idx.z);
+            ap_y.get_HR_values(R_idx.x, R_idx.y, R_idx.z);
+            ap_z.get_HR_values(R_idx.x, R_idx.y, R_idx.z);
+        }
+
+        // Only insert this pair into the container corresponding to atom iat2
+        DR_x_[iat2]->insert_pair(ap_x);
+        DR_y_[iat2]->insert_pair(ap_y);
+        DR_z_[iat2]->insert_pair(ap_z);
+    }
+
+    // 3. Allocate memory for all DR_[K] containers
+    for (int K = 0; K < nat_; ++K)
+    {
+        DR_x_[K]->allocate(nullptr, true);
+        DR_y_[K]->allocate(nullptr, true);
+        DR_z_[K]->allocate(nullptr, true);
+    }
+
+    // 4. Calculate and fill the R-space derivatives of the overlap matrix
+    int npol = ucell->get_npol();
+    for (int iap = 0; iap < sR_template->size_atom_pairs(); ++iap)
+    {
+        const auto& ap = sR_template->get_atom_pair(iap);
+        int iat1 = ap.get_atom_i();
+        int iat2 = ap.get_atom_j();
+
+        int T1 = ucell->iat2it[iat1];
+        int T2 = ucell->iat2it[iat2];
+        const Atom& atom1 = ucell->atoms[T1];
+        const Atom& atom2 = ucell->atoms[T2];
+
+        auto row_indexes = paraV->get_indexes_row(iat1);
+        auto col_indexes = paraV->get_indexes_col(iat2);
+
+        for (int ir = 0; ir < ap.get_R_size(); ++ir)
+        {
+            auto R_idx = ap.get_R_index(ir);
+            ModuleBase::Vector3<double> dtau = ucell->cal_dtau(iat1, iat2, R_idx);
+
+            int R_arr[3] = {R_idx.x, R_idx.y, R_idx.z};
+            double* dx = DR_x_[iat2]->data(iat1, iat2, R_arr);
+            double* dy = DR_y_[iat2]->data(iat1, iat2, R_arr);
+            double* dz = DR_z_[iat2]->data(iat1, iat2, R_arr);
+
+            for (int iw1l = 0; iw1l < row_indexes.size(); iw1l += npol)
+            {
+                const int iw1 = row_indexes[iw1l] / npol;
+                int L1 = atom1.iw2l[iw1];
+                int N1 = atom1.iw2n[iw1];
+                int m1 = atom1.iw2m[iw1];
+                int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
+
+                for (int iw2l = 0; iw2l < col_indexes.size(); iw2l += npol)
+                {
+                    const int iw2 = col_indexes[iw2l] / npol;
+                    int L2 = atom2.iw2l[iw2];
+                    int N2 = atom2.iw2n[iw2];
+                    int m2 = atom2.iw2m[iw2];
+                    int M2 = (m2 % 2 == 0) ? -m2 / 2 : (m2 + 1) / 2;
+
+                    double olm[4] = {0.0, 0.0, 0.0, 0.0};
+                    // out stores the integral value in olm[0], grad_out stores the gradient in olm[1] to olm[3]
+                    intor->calculate(T1, L1, N1, M1, T2, L2, N2, M2, dtau * ucell->lat0, &olm[0], &olm[1]);
+
+                    // Handle the spin dimension (the overlap is diagonal in spin space)
+                    for (int is1 = 0; is1 < npol; ++is1)
+                    {
+                        for (int is2 = 0; is2 < npol; ++is2)
+                        {
+                            int r_offset = iw1l + is1;
+                            int c_offset = iw2l + is2;
+                            int linear_idx = r_offset * col_indexes.size() + c_offset;
+
+                            if (is1 == is2)
+                            {
+                                dx[linear_idx] = olm[1];
+                                dy[linear_idx] = olm[2];
+                                dz[linear_idx] = olm[3];
+                            }
+                            else
+                            {
+                                dx[linear_idx] = 0.0;
+                                dy[linear_idx] = 0.0;
+                                dz[linear_idx] = 0.0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+template <typename T_sR>
+void TD_MovingGauge::update_DR(const hamilt::HContainer<T_sR>* sR_template,
+                               const UnitCell* ucell,
+                               const Parallel_Orbitals* paraV,
+                               TwoCenterIntegrator* intor)
+{
+    // Update the R-space derivatives of the overlap matrix with the current atomic positions
+    int npol = ucell->get_npol();
+    for (int iap = 0; iap < sR_template->size_atom_pairs(); ++iap)
+    {
+        const auto& ap = sR_template->get_atom_pair(iap);
+        int iat1 = ap.get_atom_i();
+        int iat2 = ap.get_atom_j();
+
+        int T1 = ucell->iat2it[iat1];
+        int T2 = ucell->iat2it[iat2];
+        const Atom& atom1 = ucell->atoms[T1];
+        const Atom& atom2 = ucell->atoms[T2];
+
+        auto row_indexes = paraV->get_indexes_row(iat1);
+        auto col_indexes = paraV->get_indexes_col(iat2);
+
+        for (int ir = 0; ir < ap.get_R_size(); ++ir)
+        {
+            auto R_idx = ap.get_R_index(ir);
+            ModuleBase::Vector3<double> dtau = ucell->cal_dtau(iat1, iat2, R_idx);
+
+            int R_arr[3] = {R_idx.x, R_idx.y, R_idx.z};
+            double* dx = DR_x_[iat2]->data(iat1, iat2, R_arr);
+            double* dy = DR_y_[iat2]->data(iat1, iat2, R_arr);
+            double* dz = DR_z_[iat2]->data(iat1, iat2, R_arr);
+
+            for (int iw1l = 0; iw1l < row_indexes.size(); iw1l += npol)
+            {
+                const int iw1 = row_indexes[iw1l] / npol;
+                int L1 = atom1.iw2l[iw1];
+                int N1 = atom1.iw2n[iw1];
+                int m1 = atom1.iw2m[iw1];
+                int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
+
+                for (int iw2l = 0; iw2l < col_indexes.size(); iw2l += npol)
+                {
+                    const int iw2 = col_indexes[iw2l] / npol;
+                    int L2 = atom2.iw2l[iw2];
+                    int N2 = atom2.iw2n[iw2];
+                    int m2 = atom2.iw2m[iw2];
+                    int M2 = (m2 % 2 == 0) ? -m2 / 2 : (m2 + 1) / 2;
+
+                    double olm[4] = {0.0, 0.0, 0.0, 0.0};
+                    intor->calculate(T1, L1, N1, M1, T2, L2, N2, M2, dtau * ucell->lat0, &olm[0], &olm[1]);
+
+                    for (int is1 = 0; is1 < npol; ++is1)
+                    {
+                        for (int is2 = 0; is2 < npol; ++is2)
+                        {
+                            int r_offset = iw1l + is1;
+                            int c_offset = iw2l + is2;
+                            int linear_idx = r_offset * col_indexes.size() + c_offset;
+
+                            if (is1 == is2)
+                            {
+                                dx[linear_idx] = olm[1];
+                                dy[linear_idx] = olm[2];
+                                dz[linear_idx] = olm[3];
+                            }
+                            else
+                            {
+                                dx[linear_idx] = 0.0;
+                                dy[linear_idx] = 0.0;
+                                dz[linear_idx] = 0.0;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+template <typename TK>
+void TD_MovingGauge::get_D_k(int K, const ModuleBase::Vector3<double>& kvec_d, TK* Dk_x, TK* Dk_y, TK* Dk_z, int hk_ld)
+    const
+{
+    hamilt::folding_HR(*(DR_x_[K]), Dk_x, kvec_d, hk_ld, 1);
+    hamilt::folding_HR(*(DR_y_[K]), Dk_y, kvec_d, hk_ld, 1);
+    hamilt::folding_HR(*(DR_z_[K]), Dk_z, kvec_d, hk_ld, 1);
+}
+
+template <typename TK>
+void TD_MovingGauge::get_P_k(const UnitCell* ucell,
+                             const ModuleBase::Vector3<double>& kvec_d,
+                             TK* P_k,
+                             int matrix_size,
+                             int hk_ld) const
+{
+    std::vector<TK> Dk_x(matrix_size, TK(0.0, 0.0));
+    std::vector<TK> Dk_y(matrix_size, TK(0.0, 0.0));
+    std::vector<TK> Dk_z(matrix_size, TK(0.0, 0.0));
+
+    for (int K = 0; K < nat_; ++K)
+    {
+        std::fill(Dk_x.begin(), Dk_x.end(), TK(0.0, 0.0));
+        std::fill(Dk_y.begin(), Dk_y.end(), TK(0.0, 0.0));
+        std::fill(Dk_z.begin(), Dk_z.end(), TK(0.0, 0.0));
+
+        this->get_D_k(K, kvec_d, Dk_x.data(), Dk_y.data(), Dk_z.data(), hk_ld);
+
+        // Obtain the real-time velocity of atom K from the UnitCell (in Hartree atomic units)
+        int it = ucell->iat2it[K];
+        int ia = ucell->iat2ia[K];
+        double vx = ucell->atoms[it].vel[ia].x;
+        double vy = ucell->atoms[it].vel[ia].y;
+        double vz = ucell->atoms[it].vel[ia].z;
+
+        // Construct the coefficients: P = -i * v * D
+        // Unit conversion: Hartree a.u. to Rydberg a.u. requires multiplying
+        TK coef_x(0.0, -2.0 * vx);
+        TK coef_y(0.0, -2.0 * vy);
+        TK coef_z(0.0, -2.0 * vz);
+
+        // Accumulate the contribution from atom K to the P_k matrix
+        for (int i = 0; i < matrix_size; ++i)
+        {
+            P_k[i] += coef_x * Dk_x[i] + coef_y * Dk_y[i] + coef_z * Dk_z[i];
+        }
+    }
+}
+
+template void TD_MovingGauge::init_DR<double>(const hamilt::HContainer<double>* sR_template,
+                                              const UnitCell* ucell,
+                                              const Parallel_Orbitals* paraV,
+                                              TwoCenterIntegrator* intor);
+
+template void TD_MovingGauge::init_DR<std::complex<double>>(const hamilt::HContainer<std::complex<double>>* sR_template,
+                                                            const UnitCell* ucell,
+                                                            const Parallel_Orbitals* paraV,
+                                                            TwoCenterIntegrator* intor);
+
+template void TD_MovingGauge::update_DR<double>(const hamilt::HContainer<double>* sR_template,
+                                                const UnitCell* ucell,
+                                                const Parallel_Orbitals* paraV,
+                                                TwoCenterIntegrator* intor);
+
+template void TD_MovingGauge::update_DR<std::complex<double>>(
+    const hamilt::HContainer<std::complex<double>>* sR_template,
+    const UnitCell* ucell,
+    const Parallel_Orbitals* paraV,
+    TwoCenterIntegrator* intor);
+
+template void TD_MovingGauge::get_D_k<std::complex<double>>(int K,
+                                                            const ModuleBase::Vector3<double>& kvec_d,
+                                                            std::complex<double>* Dk_x,
+                                                            std::complex<double>* Dk_y,
+                                                            std::complex<double>* Dk_z,
+                                                            int hk_ld) const;
+
+template void TD_MovingGauge::get_P_k<std::complex<double>>(const UnitCell* ucell,
+                                                            const ModuleBase::Vector3<double>& kvec_d,
+                                                            std::complex<double>* P_k,
+                                                            int matrix_size,
+                                                            int hk_ld) const;
+
+} // namespace module_rt
diff --git a/source/source_lcao/module_rt/td_moving_gauge.h b/source/source_lcao/module_rt/td_moving_gauge.h
new file mode 100644
index 00000000000..449d800ba2e
--- /dev/null
+++ b/source/source_lcao/module_rt/td_moving_gauge.h
@@ -0,0 +1,61 @@
+#ifndef TD_MOVING_GAUGE_H
+#define TD_MOVING_GAUGE_H
+
+#include "source_basis/module_ao/parallel_orbitals.h"
+#include "source_basis/module_nao/two_center_integrator.h"
+#include "source_cell/unitcell.h"
+#include "source_lcao/module_hcontainer/hcontainer.h"
+#include "source_lcao/module_hcontainer/hcontainer_funcs.h"
+
+#include <complex>
+#include <vector>
+
+namespace module_rt
+{
+
+class TD_MovingGauge
+{
+  public:
+    TD_MovingGauge() = default;
+    ~TD_MovingGauge();
+
+    // Initialize the R-space derivative matrices D_R (x, y, z)
+    // using the provided sR_template for consistent sparse atomic pair topology
+    // D_{K,\mu\nu}(R) = <\phi_{\mu 0}|∂\phi_{\nu R}/∂\tau_K> where tau_K is the position of atom K
+    template <typename T_sR>
+    void init_DR(const hamilt::HContainer<T_sR>* sR_template,
+                 const UnitCell* ucell,
+                 const Parallel_Orbitals* paraV,
+                 TwoCenterIntegrator* intor);
+
+    // Update the R-space matrix D_R (x, y, z)
+    template <typename T_sR>
+    void update_DR(const hamilt::HContainer<T_sR>* sR_template,
+                   const UnitCell* ucell,
+                   const Parallel_Orbitals* paraV,
+                   TwoCenterIntegrator* intor);
+
+    // Fourier transform D(R) to D(k)
+    // Note: folding_HR performs an accumulation (+=) operation, need to ensure Dk matrices are zeroed before calling
+    // D_{K,\mu\nu}(k) = \sum_R e^{ikR} D_{K,\mu\nu}(R)
+    template <typename TK>
+    void get_D_k(int K, const ModuleBase::Vector3<double>& kvec_d, TK* Dk_x, TK* Dk_y, TK* Dk_z, int hk_ld) const;
+
+    // Calculate the moving spatial gauge matrix P_k and accumulate it to the input P_k matrix
+    // Note: The unit is converted to Rydberg atomic units, and multiplied by 2 internally
+    // P_{\mu\nu}(k) = -i \sum_K vel_K \cdot D_{K,\mu\nu}(k) where vel_K is the velocity of atom K
+    template <typename TK>
+    void get_P_k(const UnitCell* ucell, const ModuleBase::Vector3<double>& kvec_d, TK* P_k, int matrix_size, int hk_ld)
+        const;
+
+  private:
+    int nat_ = 0;
+
+    std::vector<hamilt::HContainer<double>*> DR_x_;
+    std::vector<hamilt::HContainer<double>*> DR_y_;
+    std::vector<hamilt::HContainer<double>*> DR_z_;
+};
+
+} // namespace module_rt
+
+#endif // TD_MOVING_GAUGE_H