diff --git a/python/pyabacus/src/hsolver/py_diago_cg.hpp b/python/pyabacus/src/hsolver/py_diago_cg.hpp
index f1f84e9a77..f907e2e764 100644
--- a/python/pyabacus/src/hsolver/py_diago_cg.hpp
+++ b/python/pyabacus/src/hsolver/py_diago_cg.hpp
@@ -153,8 +153,6 @@ class PyDiagoCG
             const int nrow   = ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1);
             const int nbands = ndim == 1 ? 1 : psi_in.shape().dim_size(0);
             syncmem_z2z_h2h_op()(
-                this->ctx,
-                this->ctx,
                 spsi_out.data<std::complex<double>>(), 
                 psi_in.data<std::complex<double>>(), 
                 static_cast<size_t>(nrow * nbands)
diff --git a/python/pyabacus/src/hsolver/py_diago_david.hpp b/python/pyabacus/src/hsolver/py_diago_david.hpp
index 8a8d2c727e..7087af632e 100644
--- a/python/pyabacus/src/hsolver/py_diago_david.hpp
+++ b/python/pyabacus/src/hsolver/py_diago_david.hpp
@@ -135,7 +135,7 @@ class PyDiagoDavid
             const int nrow, 
             const int nbands
         ) {
-            syncmem_op()(this->ctx, this->ctx, spsi_out, psi_in, static_cast<size_t>(nbands * nrow));
+            syncmem_op()(spsi_out, psi_in, static_cast<size_t>(nbands * nrow));
         };
 
         obj = std::make_unique<hsolver::DiagoDavid<std::complex<double>, base_device::DEVICE_CPU>>(
diff --git a/source/module_base/kernels/dsp/dsp_connector.h b/source/module_base/kernels/dsp/dsp_connector.h
index b51c67663e..ea0d17749e 100644
--- a/source/module_base/kernels/dsp/dsp_connector.h
+++ b/source/module_base/kernels/dsp/dsp_connector.h
@@ -75,7 +75,7 @@ void dsp_dav_subspace_reduce(T* hcc, T* scc, int nbase, int nbase_x, int notconv
 
 	auto* swap = new T[notconv * nbase_x];
     auto* target = new T[notconv * nbase_x];
-    syncmem_complex_op()(cpu_ctx, cpu_ctx, swap, hcc + nbase * nbase_x, notconv * nbase_x);
+    syncmem_complex_op()(swap, hcc + nbase * nbase_x, notconv * nbase_x);
     if (base_device::get_current_precision(swap) == "single")
     {
         MPI_Reduce(swap,
@@ -97,8 +97,8 @@ void dsp_dav_subspace_reduce(T* hcc, T* scc, int nbase, int nbase_x, int notconv
                     diag_comm);
     }
 
-    syncmem_complex_op()(cpu_ctx, cpu_ctx, hcc + nbase * nbase_x, target, notconv * nbase_x);
-    syncmem_complex_op()(cpu_ctx, cpu_ctx, swap, scc + nbase * nbase_x, notconv * nbase_x);
+    syncmem_complex_op()(hcc + nbase * nbase_x, target, notconv * nbase_x);
+    syncmem_complex_op()(swap, scc + nbase * nbase_x, notconv * nbase_x);
 
     if (base_device::get_current_precision(swap) == "single")
     {
@@ -121,7 +121,7 @@ void dsp_dav_subspace_reduce(T* hcc, T* scc, int nbase, int nbase_x, int notconv
                     diag_comm);
     }
 
-    syncmem_complex_op()(cpu_ctx, cpu_ctx, scc + nbase * nbase_x, target, notconv * nbase_x);
+    syncmem_complex_op()(scc + nbase * nbase_x, target, notconv * nbase_x);
     delete[] swap;
     delete[] target;
 }
diff --git a/source/module_base/kernels/test/math_op_test.cpp b/source/module_base/kernels/test/math_op_test.cpp
index 7136ab8d35..cfdedb234e 100644
--- a/source/module_base/kernels/test/math_op_test.cpp
+++ b/source/module_base/kernels/test/math_op_test.cpp
@@ -306,13 +306,13 @@ TEST_F(TestModuleBaseMathMultiDevice, cal_ylm_real_op_gpu)
     std::vector<double> ylm(expected_ylm.size(), 0.0);
     double * d_ylm = nullptr, * d_g = nullptr, * d_p = nullptr;
 
-    resmem_var_op()(gpu_ctx, d_g, g.size());
-    resmem_var_op()(gpu_ctx, d_p, p.size());
-    resmem_var_op()(gpu_ctx, d_ylm, ylm.size());
+    resmem_var_op()(d_g, g.size());
+    resmem_var_op()(d_p, p.size());
+    resmem_var_op()(d_ylm, ylm.size());
 
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_g, g.data(), g.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_p, p.data(), p.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_ylm, ylm.data(), ylm.size());
+    syncmem_var_h2d_op()(d_g, g.data(), g.size());
+    syncmem_var_h2d_op()(d_p, p.data(), p.size());
+    syncmem_var_h2d_op()(d_ylm, ylm.data(), ylm.size());
 
     ModuleBase::cal_ylm_real_op<double, base_device::DEVICE_GPU>()(gpu_ctx,
                                                                    ng,
@@ -326,15 +326,15 @@ TEST_F(TestModuleBaseMathMultiDevice, cal_ylm_real_op_gpu)
                                                                    d_p,
                                                                    d_ylm);
 
-    syncmem_var_d2h_op()(cpu_ctx, gpu_ctx, ylm.data(), d_ylm, ylm.size());
+    syncmem_var_d2h_op()(ylm.data(), d_ylm, ylm.size());
 
     for (int ii = 0; ii < ylm.size(); ii++) {
         EXPECT_LT(fabs(ylm[ii] - expected_ylm[ii]), 6e-5);
     }
 
-    delmem_var_op()(gpu_ctx, d_g);
-    delmem_var_op()(gpu_ctx, d_p);
-    delmem_var_op()(gpu_ctx, d_ylm);
+    delmem_var_op()(d_g);
+    delmem_var_op()(d_p);
+    delmem_var_op()(d_ylm);
 }
 
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
\ No newline at end of file
diff --git a/source/module_base/math_chebyshev.cpp b/source/module_base/math_chebyshev.cpp
index 9bfac7cac9..b2cc6aadea 100644
--- a/source/module_base/math_chebyshev.cpp
+++ b/source/module_base/math_chebyshev.cpp
@@ -63,8 +63,8 @@ Chebyshev<REAL, Device>::Chebyshev(const int norder_in) : fftw(2 * EXTEND * nord
     coefc_cpu = new std::complex<REAL>[norder];
     if (base_device::get_device_type<Device>(this->ctx) == base_device::GpuDevice)
     {
-        resmem_var_op()(this->ctx, this->coef_real, norder);
-        resmem_complex_op()(this->ctx, this->coef_complex, norder);
+        resmem_var_op()(this->coef_real, norder);
+        resmem_complex_op()(this->coef_complex, norder);
     }
     else
     {
@@ -84,8 +84,8 @@ Chebyshev<REAL, Device>::~Chebyshev()
     delete[] polytrace;
     if (base_device::get_device_type<Device>(this->ctx) == base_device::GpuDevice)
     {
-        delmem_var_op()(this->ctx, this->coef_real);
-        delmem_complex_op()(this->ctx, this->coef_complex);
+        delmem_var_op()(this->coef_real);
+        delmem_complex_op()(this->coef_complex);
     }
     else
     {
@@ -129,10 +129,10 @@ REAL Chebyshev<REAL, Device>::ddot_real(const std::complex<REAL>* psi_L,
         pL = (REAL*)psi_L;
         pR = (REAL*)psi_R;
         REAL* dot_device = nullptr;
-        resmem_var_op()(this->ctx, dot_device, 1);
+        resmem_var_op()(dot_device, 1);
         container::kernels::blas_dot<REAL, ct_Device>()(dim2, pL, 1, pR, 1, dot_device);
-        syncmem_var_d2h_op()(cpu_ctx, this->ctx, &result, dot_device, 1);
-        delmem_var_op()(this->ctx, dot_device);
+        syncmem_var_d2h_op()(&result, dot_device, 1);
+        delmem_var_op()(dot_device);
     }
     else
     {
@@ -140,18 +140,18 @@ REAL Chebyshev<REAL, Device>::ddot_real(const std::complex<REAL>* psi_L,
         pL = (REAL*)psi_L;
         pR = (REAL*)psi_R;
         REAL* dot_device = nullptr;
-        resmem_var_op()(this->ctx, dot_device, 1);
+        resmem_var_op()(dot_device, 1);
         for (int i = 0; i < m; ++i)
         {
             int dim2 = 2 * N;
             container::kernels::blas_dot<REAL, ct_Device>()(dim2, pL, 1, pR, 1, dot_device);
             REAL result_temp = 0;
-            syncmem_var_d2h_op()(cpu_ctx, this->ctx, &result_temp, dot_device, 1);
+            syncmem_var_d2h_op()(&result_temp, dot_device, 1);
             result += result_temp;
             pL += 2 * LDA;
             pR += 2 * LDA;
         }
-        delmem_var_op()(this->ctx, dot_device);
+        delmem_var_op()(dot_device);
     }
     return result;
 }
@@ -211,7 +211,7 @@ void Chebyshev<REAL, Device>::calcoef_real(std::function<REAL(REAL)> fun)
 
     if (base_device::get_device_type<Device>(this->ctx) == base_device::GpuDevice)
     {
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, coef_real, coefr_cpu, norder);
+        syncmem_var_h2d_op()(coef_real, coefr_cpu, norder);
     }
 
     getcoef_real = true;
@@ -301,7 +301,7 @@ void Chebyshev<REAL, Device>::calcoef_complex(std::function<std::complex<REAL>(s
     }
     if (base_device::get_device_type<Device>(this->ctx) == base_device::GpuDevice)
     {
-        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, coef_complex, coefc_cpu, norder);
+        syncmem_complex_h2d_op()(coef_complex, coefc_cpu, norder);
     }
 
     getcoef_complex = true;
@@ -392,7 +392,7 @@ void Chebyshev<REAL, Device>::calcoef_pair(std::function<REAL(REAL)> fun1, std::
 
     if (base_device::get_device_type<Device>(this->ctx) == base_device::GpuDevice)
     {
-        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, coef_complex, coefc_cpu, norder);
+        syncmem_complex_h2d_op()(coef_complex, coefc_cpu, norder);
     }
 
     getcoef_complex = true;
@@ -427,17 +427,17 @@ void Chebyshev<REAL, Device>::calfinalvec_real(
         ndmxt = LDA * m;
     }
 
-    resmem_complex_op()(this->ctx, arraynp1, ndmxt);
-    resmem_complex_op()(this->ctx, arrayn, ndmxt);
-    resmem_complex_op()(this->ctx, arrayn_1, ndmxt);
+    resmem_complex_op()(arraynp1, ndmxt);
+    resmem_complex_op()(arrayn, ndmxt);
+    resmem_complex_op()(arrayn_1, ndmxt);
 
-    memcpy_complex_op()(this->ctx, this->ctx, arrayn_1, wavein, ndmxt);
+    memcpy_complex_op()(arrayn_1, wavein, ndmxt);
     // ModuleBase::GlobalFunc::DCOPY(wavein, arrayn_1, ndmxt);
 
     funA(arrayn_1, arrayn, m);
 
     // 0- & 1-st order
-    setmem_complex_op()(this->ctx, waveout, 0, ndmxt);
+    setmem_complex_op()(waveout, 0, ndmxt);
     std::complex<REAL> coef0 = std::complex<REAL>(coefr_cpu[0], 0);
     container::kernels::blas_axpy<std::complex<REAL>, ct_Device>()(ndmxt, &coef0, arrayn_1, 1, waveout, 1);
     std::complex<REAL> coef1 = std::complex<REAL>(coefr_cpu[1], 0);
@@ -462,9 +462,9 @@ void Chebyshev<REAL, Device>::calfinalvec_real(
         arrayn = arraynp1;
         arraynp1 = tem;
     }
-    delmem_complex_op()(this->ctx, arraynp1);
-    delmem_complex_op()(this->ctx, arrayn);
-    delmem_complex_op()(this->ctx, arrayn_1);
+    delmem_complex_op()(arraynp1);
+    delmem_complex_op()(arrayn);
+    delmem_complex_op()(arrayn_1);
     return;
 }
 
@@ -496,16 +496,16 @@ void Chebyshev<REAL, Device>::calfinalvec_complex(
         ndmxt = LDA * m;
     }
 
-    resmem_complex_op()(this->ctx, arraynp1, ndmxt);
-    resmem_complex_op()(this->ctx, arrayn, ndmxt);
-    resmem_complex_op()(this->ctx, arrayn_1, ndmxt);
+    resmem_complex_op()(arraynp1, ndmxt);
+    resmem_complex_op()(arrayn, ndmxt);
+    resmem_complex_op()(arrayn_1, ndmxt);
 
-    memcpy_complex_op()(this->ctx, this->ctx, arrayn_1, wavein, ndmxt);
+    memcpy_complex_op()(arrayn_1, wavein, ndmxt);
 
     funA(arrayn_1, arrayn, m);
 
     // 0- & 1-st order
-    setmem_complex_op()(this->ctx, waveout, 0, ndmxt);
+    setmem_complex_op()(waveout, 0, ndmxt);
     container::kernels::blas_axpy<std::complex<REAL>, ct_Device>()(ndmxt, &coefc_cpu[0], arrayn_1, 1, waveout, 1);
     container::kernels::blas_axpy<std::complex<REAL>, ct_Device>()(ndmxt, &coefc_cpu[1], arrayn, 1, waveout, 1);
     // for (int i = 0; i < ndmxt; ++i)
@@ -527,9 +527,9 @@ void Chebyshev<REAL, Device>::calfinalvec_complex(
         arrayn = arraynp1;
         arraynp1 = tem;
     }
-    delmem_complex_op()(this->ctx, arraynp1);
-    delmem_complex_op()(this->ctx, arrayn);
-    delmem_complex_op()(this->ctx, arrayn_1);
+    delmem_complex_op()(arraynp1);
+    delmem_complex_op()(arrayn);
+    delmem_complex_op()(arrayn_1);
     return;
 }
 
@@ -553,7 +553,7 @@ void Chebyshev<REAL, Device>::calpolyvec_complex(
     std::complex<REAL>*tmpin = wavein, *tmpout = arrayn_1;
     for (int i = 0; i < m; ++i)
     {
-        memcpy_complex_op()(this->ctx, this->ctx, tmpout, tmpin, N);
+        memcpy_complex_op()(tmpout, tmpin, N);
         // ModuleBase::GlobalFunc::DCOPY(tmpin, tmpout, N);
         tmpin += LDA;
         tmpout += LDA;
@@ -595,11 +595,11 @@ void Chebyshev<REAL, Device>::tracepolyA(
         ndmxt = LDA * m;
     }
 
-    resmem_complex_op()(this->ctx, arraynp1, ndmxt);
-    resmem_complex_op()(this->ctx, arrayn, ndmxt);
-    resmem_complex_op()(this->ctx, arrayn_1, ndmxt);
+    resmem_complex_op()(arraynp1, ndmxt);
+    resmem_complex_op()(arrayn, ndmxt);
+    resmem_complex_op()(arrayn_1, ndmxt);
 
-    memcpy_complex_op()(this->ctx, this->ctx, arrayn_1, wavein, ndmxt);
+    memcpy_complex_op()(arrayn_1, wavein, ndmxt);
     // ModuleBase::GlobalFunc::DCOPY(wavein, arrayn_1, ndmxt);
 
     funA(arrayn_1, arrayn, m);
@@ -618,9 +618,9 @@ void Chebyshev<REAL, Device>::tracepolyA(
         arraynp1 = tem;
     }
 
-    delmem_complex_op()(this->ctx, arraynp1);
-    delmem_complex_op()(this->ctx, arrayn);
-    delmem_complex_op()(this->ctx, arrayn_1);
+    delmem_complex_op()(arraynp1);
+    delmem_complex_op()(arrayn);
+    delmem_complex_op()(arrayn_1);
     return;
 }
 
@@ -669,11 +669,11 @@ bool Chebyshev<REAL, Device>::checkconverge(
     std::complex<REAL>* arrayn = nullptr;
     std::complex<REAL>* arrayn_1 = nullptr;
 
-    resmem_complex_op()(this->ctx, arraynp1, LDA);
-    resmem_complex_op()(this->ctx, arrayn, LDA);
-    resmem_complex_op()(this->ctx, arrayn_1, LDA);
+    resmem_complex_op()(arraynp1, LDA);
+    resmem_complex_op()(arrayn, LDA);
+    resmem_complex_op()(arrayn_1, LDA);
 
-    memcpy_complex_op()(this->ctx, this->ctx, arrayn_1, wavein, N);
+    memcpy_complex_op()(arrayn_1, wavein, N);
     // ModuleBase::GlobalFunc::DCOPY(wavein, arrayn_1, N);
 
     if (tmin == tmax)
@@ -754,9 +754,9 @@ bool Chebyshev<REAL, Device>::checkconverge(
         arraynp1 = tem;
     }
 
-    delmem_complex_op()(this->ctx, arraynp1);
-    delmem_complex_op()(this->ctx, arrayn);
-    delmem_complex_op()(this->ctx, arrayn_1);
+    delmem_complex_op()(arraynp1);
+    delmem_complex_op()(arrayn);
+    delmem_complex_op()(arrayn_1);
     return converge;
 }
 
diff --git a/source/module_base/math_ylmreal.cpp b/source/module_base/math_ylmreal.cpp
index 953112996a..fac76cf959 100644
--- a/source/module_base/math_ylmreal.cpp
+++ b/source/module_base/math_ylmreal.cpp
@@ -327,7 +327,7 @@ void YlmReal::Ylm_Real(Device * ctx, const int lmax2, const int ng, const FPTYPE
         ModuleBase::WARNING_QUIT("YLM_REAL","l>30 or l<0");
     }
     FPTYPE * p = nullptr, * phi = nullptr, * cost = nullptr;
-    resmem_var_op()(ctx, p, (lmax + 1) * (lmax + 1) * ng, "YlmReal::Ylm_Real");
+    resmem_var_op()(p, (lmax + 1) * (lmax + 1) * ng, "YlmReal::Ylm_Real");
 
     cal_ylm_real_op()(
         ctx,
@@ -342,9 +342,9 @@ void YlmReal::Ylm_Real(Device * ctx, const int lmax2, const int ng, const FPTYPE
         p,
         ylm);
 
-    delmem_var_op()(ctx, p);
-    delmem_var_op()(ctx, phi);
-    delmem_var_op()(ctx, cost);
+    delmem_var_op()(p);
+    delmem_var_op()(phi);
+    delmem_var_op()(cost);
 } // end subroutine ylmr2
 
 //==========================================================
diff --git a/source/module_base/module_device/cuda/memory_op.cu b/source/module_base/module_device/cuda/memory_op.cu
index bc9384c446..c4f9efdb42 100644
--- a/source/module_base/module_device/cuda/memory_op.cu
+++ b/source/module_base/module_device/cuda/memory_op.cu
@@ -52,14 +52,13 @@ __global__ void cast_memory(std::complex<FPTYPE_out>* out, const FPTYPE_in* in,
 }
 
 template <typename FPTYPE>
-void resize_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev,
-                                                                   FPTYPE*& arr,
+void resize_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(FPTYPE*& arr,
                                                                    const size_t size,
                                                                    const char* record_in)
 {
     if (arr != nullptr)
     {
-        delete_memory_op<FPTYPE, base_device::DEVICE_GPU>()(dev, arr);
+        delete_memory_op<FPTYPE, base_device::DEVICE_GPU>()(arr);
     }
     cudaErrcheck(cudaMalloc((void**)&arr, sizeof(FPTYPE) * size));
     std::string record_string;
@@ -79,8 +78,7 @@ void resize_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_de
 }
 
 template <typename FPTYPE>
-void set_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev,
-                                                                FPTYPE* arr,
+void set_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(FPTYPE* arr,
                                                                 const int var,
                                                                 const size_t size)
 {
@@ -89,8 +87,6 @@ void set_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_devic
 
 template <typename FPTYPE>
 void synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_CPU* dev_out,
-    const base_device::DEVICE_GPU* dev_in,
     FPTYPE* arr_out,
     const FPTYPE* arr_in,
     const size_t size)
@@ -100,8 +96,6 @@ void synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_
 
 template <typename FPTYPE>
 void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>::operator()(
-    const base_device::DEVICE_GPU* dev_out,
-    const base_device::DEVICE_CPU* dev_in,
     FPTYPE* arr_out,
     const FPTYPE* arr_in,
     const size_t size)
@@ -111,8 +105,6 @@ void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_
 
 template <typename FPTYPE>
 void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_GPU* dev_out,
-    const base_device::DEVICE_GPU* dev_in,
     FPTYPE* arr_out,
     const FPTYPE* arr_in,
     const size_t size)
@@ -123,9 +115,7 @@ void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_
 template <typename FPTYPE_out, typename FPTYPE_in>
 struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
 {
-    void operator()(const base_device::DEVICE_GPU* dev_out,
-                    const base_device::DEVICE_GPU* dev_in,
-                    FPTYPE_out* arr_out,
+    void operator()(FPTYPE_out* arr_out,
                     const FPTYPE_in* arr_in,
                     const size_t size)
     {
@@ -142,9 +132,7 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
 
 template <typename FPTYPE_out, typename FPTYPE_in>
 struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_CPU> {
-    void operator()(const base_device::DEVICE_GPU* dev_out,
-                    const base_device::DEVICE_CPU* dev_in,
-                    FPTYPE_out* arr_out,
+    void operator()(FPTYPE_out* arr_out,
                     const FPTYPE_in* arr_in,
                     const size_t size) {
 
@@ -152,9 +140,7 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
         // No need to cast the memory if the data types are the same.
         if (std::is_same<FPTYPE_out, FPTYPE_in>::value)
         {
-            synchronize_memory_op<FPTYPE_out, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(dev_out,
-                                                                                                  dev_in,
-                                                                                                  arr_out,
+            synchronize_memory_op<FPTYPE_out, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(arr_out,
                                                                                                   reinterpret_cast<const FPTYPE_out*>(arr_in),
                                                                                                   size);
             return;
@@ -171,18 +157,14 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
 
 template <typename FPTYPE_out, typename FPTYPE_in>
 struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_GPU> {
-    void operator()(const base_device::DEVICE_CPU* dev_out,
-                    const base_device::DEVICE_GPU* dev_in,
-                    FPTYPE_out* arr_out,
+    void operator()(FPTYPE_out* arr_out,
                     const FPTYPE_in* arr_in,
                     const size_t size) {
         if (size == 0) {return;}
         // No need to cast the memory if the data types are the same.
         if (std::is_same<FPTYPE_out, FPTYPE_in>::value)
         {
-            synchronize_memory_op<FPTYPE_out, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(dev_out,
-                                                                                                  dev_in,
-                                                                                                  arr_out,
+            synchronize_memory_op<FPTYPE_out, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(arr_out,
                                                                                                   reinterpret_cast<const FPTYPE_out*>(arr_in),
                                                                                                   size);
             return;
@@ -197,7 +179,7 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_devic
 };
 
 template <typename FPTYPE>
-void delete_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr)
+void delete_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(FPTYPE* arr)
 {
     cudaErrcheck(cudaFree(arr));
 }
diff --git a/source/module_base/module_device/memory_op.cpp b/source/module_base/module_device/memory_op.cpp
index 3c807dfad7..525ecee89f 100644
--- a/source/module_base/module_device/memory_op.cpp
+++ b/source/module_base/module_device/memory_op.cpp
@@ -18,7 +18,7 @@ namespace memory
 template <typename FPTYPE>
 struct resize_memory_op<FPTYPE, base_device::DEVICE_CPU>
 {
-    void operator()(const base_device::DEVICE_CPU* dev, FPTYPE*& arr, const size_t size, const char* record_in)
+    void operator()(FPTYPE*& arr, const size_t size, const char* record_in)
     {
         if (arr != nullptr)
         {
@@ -45,7 +45,7 @@ struct resize_memory_op<FPTYPE, base_device::DEVICE_CPU>
 template <typename FPTYPE>
 struct set_memory_op<FPTYPE, base_device::DEVICE_CPU>
 {
-    void operator()(const base_device::DEVICE_CPU* dev, FPTYPE* arr, const int var, const size_t size)
+    void operator()(FPTYPE* arr, const int var, const size_t size)
     {
         ModuleBase::OMP_PARALLEL([&](int num_thread, int thread_id) {
             int beg = 0, len = 0;
@@ -58,9 +58,7 @@ struct set_memory_op<FPTYPE, base_device::DEVICE_CPU>
 template <typename FPTYPE>
 struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_CPU>
 {
-    void operator()(const base_device::DEVICE_CPU* dev_out,
-                    const base_device::DEVICE_CPU* dev_in,
-                    FPTYPE* arr_out,
+    void operator()(FPTYPE* arr_out,
                     const FPTYPE* arr_in,
                     const size_t size)
     {
@@ -75,9 +73,7 @@ struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVIC
 template <typename FPTYPE_out, typename FPTYPE_in>
 struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_CPU>
 {
-    void operator()(const base_device::DEVICE_CPU* dev_out,
-                    const base_device::DEVICE_CPU* dev_in,
-                    FPTYPE_out* arr_out,
+    void operator()(FPTYPE_out* arr_out,
                     const FPTYPE_in* arr_in,
                     const size_t size)
     {
@@ -94,7 +90,7 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_devic
 template <typename FPTYPE>
 struct delete_memory_op<FPTYPE, base_device::DEVICE_CPU>
 {
-    void operator()(const base_device::DEVICE_CPU* dev, FPTYPE* arr)
+    void operator()(FPTYPE* arr)
     {
         free(arr);
     }
@@ -156,8 +152,7 @@ template struct delete_memory_op<std::complex<double>*, base_device::DEVICE_CPU>
 template <typename FPTYPE>
 struct resize_memory_op<FPTYPE, base_device::DEVICE_GPU>
 {
-    void operator()(const base_device::DEVICE_GPU* dev,
-                    FPTYPE*& arr,
+    void operator()(FPTYPE*& arr,
                     const size_t size,
                     const char* record_in = nullptr)
     {
@@ -167,7 +162,7 @@ struct resize_memory_op<FPTYPE, base_device::DEVICE_GPU>
 template <typename FPTYPE>
 struct set_memory_op<FPTYPE, base_device::DEVICE_GPU>
 {
-    void operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr, const int var, const size_t size)
+    void operator()(FPTYPE* arr, const int var, const size_t size)
     {
     }
 };
@@ -175,9 +170,7 @@ struct set_memory_op<FPTYPE, base_device::DEVICE_GPU>
 template <typename FPTYPE>
 struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
 {
-    void operator()(const base_device::DEVICE_GPU* dev_out,
-                    const base_device::DEVICE_GPU* dev_in,
-                    FPTYPE* arr_out,
+    void operator()(FPTYPE* arr_out,
                     const FPTYPE* arr_in,
                     const size_t size)
     {
@@ -187,9 +180,7 @@ struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVIC
 template <typename FPTYPE>
 struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
 {
-    void operator()(const base_device::DEVICE_GPU* dev_out,
-                    const base_device::DEVICE_CPU* dev_in,
-                    FPTYPE* arr_out,
+    void operator()(FPTYPE* arr_out,
                     const FPTYPE* arr_in,
                     const size_t size)
     {
@@ -199,9 +190,7 @@ struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVIC
 template <typename FPTYPE>
 struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
 {
-    void operator()(const base_device::DEVICE_CPU* dev_out,
-                    const base_device::DEVICE_GPU* dev_in,
-                    FPTYPE* arr_out,
+    void operator()(FPTYPE* arr_out,
                     const FPTYPE* arr_in,
                     const size_t size)
     {
@@ -211,9 +200,7 @@ struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVIC
 template <typename FPTYPE_out, typename FPTYPE_in>
 struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
 {
-    void operator()(const base_device::DEVICE_GPU* dev_out,
-                    const base_device::DEVICE_GPU* dev_in,
-                    FPTYPE_out* arr_out,
+    void operator()(FPTYPE_out* arr_out,
                     const FPTYPE_in* arr_in,
                     const size_t size)
     {
@@ -223,9 +210,7 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
 template <typename FPTYPE_out, typename FPTYPE_in>
 struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
 {
-    void operator()(const base_device::DEVICE_GPU* dev_out,
-                    const base_device::DEVICE_CPU* dev_in,
-                    FPTYPE_out* arr_out,
+    void operator()(FPTYPE_out* arr_out,
                     const FPTYPE_in* arr_in,
                     const size_t size)
     {
@@ -235,9 +220,7 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
 template <typename FPTYPE_out, typename FPTYPE_in>
 struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
 {
-    void operator()(const base_device::DEVICE_CPU* dev_out,
-                    const base_device::DEVICE_GPU* dev_in,
-                    FPTYPE_out* arr_out,
+    void operator()(FPTYPE_out* arr_out,
                     const FPTYPE_in* arr_in,
                     const size_t size)
     {
@@ -247,7 +230,7 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_devic
 template <typename FPTYPE>
 struct delete_memory_op<FPTYPE, base_device::DEVICE_GPU>
 {
-    void operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr)
+    void operator()(FPTYPE* arr)
     {
     }
 };
@@ -353,7 +336,7 @@ template struct delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>;
 template <typename FPTYPE>
 struct resize_memory_op_mt<FPTYPE, base_device::DEVICE_CPU>
 {
-    void operator()(const base_device::DEVICE_CPU* dev, FPTYPE*& arr, const size_t size, const char* record_in)
+    void operator()(FPTYPE*& arr, const size_t size, const char* record_in)
     {
         if (arr != nullptr)
         {
@@ -380,7 +363,7 @@ struct resize_memory_op_mt<FPTYPE, base_device::DEVICE_CPU>
 template <typename FPTYPE>
 struct delete_memory_op_mt<FPTYPE, base_device::DEVICE_CPU>
 {
-    void operator()(const base_device::DEVICE_CPU* dev, FPTYPE* arr)
+    void operator()(FPTYPE* arr)
     {
         free_ht(arr);
     }
@@ -401,39 +384,39 @@ template struct delete_memory_op_mt<std::complex<double>, base_device::DEVICE_CP
 #endif
 
 template <typename FPTYPE>
-void resize_memory(FPTYPE* arr, base_device::AbacusDevice_t device_type)
+void resize_memory(FPTYPE* arr, const size_t size, base_device::AbacusDevice_t device_type) // NOTE(review): arr is taken by value while resize_memory_op takes FPTYPE*&, so a reallocated pointer would not reach the caller -- confirm intent
 {
     if (device_type == base_device::AbacusDevice_t::CpuDevice){
-        resize_memory_op<FPTYPE, base_device::DEVICE_CPU>()(cpu_ctx, arr);
+        resize_memory_op<FPTYPE, base_device::DEVICE_CPU>()(arr, size); // NOTE(review): two-argument call; presumably relies on a default for record_in declared elsewhere -- confirm
     }
     else if (device_type == base_device::AbacusDevice_t::GpuDevice){
-        resize_memory_op<FPTYPE, base_device::DEVICE_GPU>()(gpu_ctx, arr);
+        resize_memory_op<FPTYPE, base_device::DEVICE_GPU>()(arr, size); // device buffer: GPU specialization, no context argument
     }
 }
 
 template <typename FPTYPE>
 void set_memory(FPTYPE* arr, const int var, const size_t size, base_device::AbacusDevice_t device_type){
     if (device_type == base_device::AbacusDevice_t::CpuDevice){
-        set_memory_op<FPTYPE, base_device::DEVICE_CPU>()(cpu_ctx, arr, var, size);
+        set_memory_op<FPTYPE, base_device::DEVICE_CPU>()(arr, var, size); // host buffer: forward to the CPU specialization
     }
     else if (device_type == base_device::AbacusDevice_t::GpuDevice){
-        set_memory_op<FPTYPE, base_device::DEVICE_GPU>()(gpu_ctx, arr, var, size);
+        set_memory_op<FPTYPE, base_device::DEVICE_GPU>()(arr, var, size); // device buffer: GPU specialization; any other device_type does nothing
     }
 }
 
 template <typename FPTYPE>
 void synchronize_memory(FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size, base_device::AbacusDevice_t device_type_out, base_device::AbacusDevice_t device_type_in){
-    if (device_type_out == base_device::AbacusDevice_t::CpuDevice || device_type_in == base_device::AbacusDevice_t::CpuDevice){
-        synchronize_memory_op<FPTYPE, DEVICE_CPU, DEVICE_CPU>()(cpu_ctx, cpu_ctx, arr_out, arr_in, size);
+    if (device_type_out == base_device::AbacusDevice_t::CpuDevice && device_type_in == base_device::AbacusDevice_t::CpuDevice){ // both sides must match: with || this branch swallowed every host<->device copy
+        synchronize_memory_op<FPTYPE, DEVICE_CPU, DEVICE_CPU>()(arr_out, arr_in, size);
     }
-    else if (device_type_out == base_device::AbacusDevice_t::CpuDevice || device_type_in == base_device::AbacusDevice_t::GpuDevice){
-        synchronize_memory_op<FPTYPE, DEVICE_CPU, DEVICE_GPU>()(cpu_ctx, gpu_ctx, arr_out, arr_in, size);
+    else if (device_type_out == base_device::AbacusDevice_t::CpuDevice && device_type_in == base_device::AbacusDevice_t::GpuDevice){ // output on CPU, input on GPU
+        synchronize_memory_op<FPTYPE, DEVICE_CPU, DEVICE_GPU>()(arr_out, arr_in, size);
     }
-    else if (device_type_out == base_device::AbacusDevice_t::GpuDevice || device_type_in == base_device::AbacusDevice_t::CpuDevice){
-        synchronize_memory_op<FPTYPE, DEVICE_GPU, DEVICE_CPU>()(gpu_ctx, cpu_ctx, arr_out, arr_in, size);
+    else if (device_type_out == base_device::AbacusDevice_t::GpuDevice && device_type_in == base_device::AbacusDevice_t::CpuDevice){ // output on GPU, input on CPU
+        synchronize_memory_op<FPTYPE, DEVICE_GPU, DEVICE_CPU>()(arr_out, arr_in, size);
     }
-    else if (device_type_out == base_device::AbacusDevice_t::GpuDevice || device_type_in == base_device::AbacusDevice_t::GpuDevice){
-        synchronize_memory_op<FPTYPE, DEVICE_GPU, DEVICE_GPU>()(gpu_ctx, gpu_ctx, arr_out, arr_in, size);
+    else if (device_type_out == base_device::AbacusDevice_t::GpuDevice && device_type_in == base_device::AbacusDevice_t::GpuDevice){ // both buffers on GPU
+        synchronize_memory_op<FPTYPE, DEVICE_GPU, DEVICE_GPU>()(arr_out, arr_in, size);
     }
 }
 
@@ -441,16 +424,16 @@ template <typename FPTYPE_out, typename FPTYPE_in>
 void cast_memory(FPTYPE_out* arr_out, const FPTYPE_in* arr_in, const size_t size, base_device::AbacusDevice_t device_type_out, base_device::AbacusDevice_t device_type_in)
 {
-    if (device_type_out == base_device::AbacusDevice_t::CpuDevice || device_type_in == base_device::AbacusDevice_t::CpuDevice){
-        cast_memory_op<FPTYPE_out, FPTYPE_in, DEVICE_CPU, DEVICE_CPU>()(cpu_ctx, cpu_ctx, arr_out, arr_in, size);
+    if (device_type_out == base_device::AbacusDevice_t::CpuDevice && device_type_in == base_device::AbacusDevice_t::CpuDevice){ // both ends must match the op's <Device_out, Device_in>
+        cast_memory_op<FPTYPE_out, FPTYPE_in, DEVICE_CPU, DEVICE_CPU>()(arr_out, arr_in, size);
     }
-    else if (device_type_out == base_device::AbacusDevice_t::CpuDevice || device_type_in == base_device::AbacusDevice_t::GpuDevice){
-        cast_memory_op<FPTYPE_out, FPTYPE_in, DEVICE_CPU, DEVICE_GPU>()(cpu_ctx, gpu_ctx, arr_out, arr_in, size);
+    else if (device_type_out == base_device::AbacusDevice_t::CpuDevice && device_type_in == base_device::AbacusDevice_t::GpuDevice){
+        cast_memory_op<FPTYPE_out, FPTYPE_in, DEVICE_CPU, DEVICE_GPU>()(arr_out, arr_in, size);
     }
-    else if (device_type_out == base_device::AbacusDevice_t::GpuDevice || device_type_in == base_device::AbacusDevice_t::CpuDevice){
-        cast_memory_op<FPTYPE_out, FPTYPE_in, DEVICE_GPU, DEVICE_CPU>()(gpu_ctx, cpu_ctx, arr_out, arr_in, size);
+    else if (device_type_out == base_device::AbacusDevice_t::GpuDevice && device_type_in == base_device::AbacusDevice_t::CpuDevice){
+        cast_memory_op<FPTYPE_out, FPTYPE_in, DEVICE_GPU, DEVICE_CPU>()(arr_out, arr_in, size);
     }
-    else if (device_type_out == base_device::AbacusDevice_t::GpuDevice || device_type_in == base_device::AbacusDevice_t::GpuDevice){
-        cast_memory_op<FPTYPE_out, FPTYPE_in, DEVICE_GPU, DEVICE_GPU>()(gpu_ctx, gpu_ctx, arr_out, arr_in, size);
+    else if (device_type_out == base_device::AbacusDevice_t::GpuDevice && device_type_in == base_device::AbacusDevice_t::GpuDevice){
+        cast_memory_op<FPTYPE_out, FPTYPE_in, DEVICE_GPU, DEVICE_GPU>()(arr_out, arr_in, size);
     }
 }
 
@@ -458,10 +441,10 @@ template <typename FPTYPE>
 void delete_memory(FPTYPE* arr, base_device::AbacusDevice_t device_type)
 {
     if (device_type == base_device::AbacusDevice_t::CpuDevice){
-        delete_memory_op<FPTYPE, DEVICE_CPU>()(cpu_ctx, arr);
+        delete_memory_op<FPTYPE, DEVICE_CPU>()(arr);
     }
     else if (device_type == base_device::AbacusDevice_t::GpuDevice){
-        delete_memory_op<FPTYPE, DEVICE_GPU>()(gpu_ctx, arr);
+        delete_memory_op<FPTYPE, DEVICE_GPU>()(arr);
     }
 }
 
diff --git a/source/module_base/module_device/memory_op.h b/source/module_base/module_device/memory_op.h
index 14926caf9b..e09294d970 100644
--- a/source/module_base/module_device/memory_op.h
+++ b/source/module_base/module_device/memory_op.h
@@ -18,13 +18,12 @@ struct resize_memory_op
     /// @brief Allocate memory for a given pointer. Note this op will free the pointer first.
     ///
     /// Input Parameters
-    /// \param dev : the type of computing device
     /// \param size : array size
     /// \param record_string : label for memory record
     ///
     /// Output Parameters
     /// \param arr : allocated array
-    void operator()(const Device* dev, FPTYPE*& arr, const size_t size, const char* record_in = nullptr);
+    void operator()(FPTYPE*& arr, const size_t size, const char* record_in = nullptr);
 };
 
 template <typename FPTYPE, typename Device>
@@ -33,13 +32,12 @@ struct set_memory_op
     /// @brief memset for multi-device
     ///
     /// Input Parameters
-    /// \param dev : the type of computing device
     /// \param var : the specified constant value
     /// \param size : array size
     ///
     /// Output Parameters
     /// \param arr : output array initialized by the input value
-    void operator()(const Device* dev, FPTYPE* arr, const int var, const size_t size);
+    void operator()(FPTYPE* arr, const int var, const size_t size);
 };
 
 template <typename FPTYPE, typename Device_out, typename Device_in>
@@ -48,16 +46,12 @@ struct synchronize_memory_op
     /// @brief memcpy for multi-device
     ///
     /// Input Parameters
-    /// \param dev_out : the type of computing device of arr_out
-    /// \param dev_in : the type of computing device of arr_in
     /// \param arr_in : input array
     /// \param size : array size
     ///
     /// Output Parameters
     /// \param arr_out : output array initialized by the input array
-    void operator()(const Device_out* dev_out,
-                    const Device_in* dev_in,
-                    FPTYPE* arr_out,
+    void operator()(FPTYPE* arr_out,
                     const FPTYPE* arr_in,
                     const size_t size);
 };
@@ -68,16 +62,12 @@ struct cast_memory_op
     /// @brief memcpy for multi-device
     ///
     /// Input Parameters
-    /// \param dev_out : the type of computing device of arr_out
-    /// \param dev_in : the type of computing device of arr_in
     /// \param arr_in : input array
     /// \param size : array size
     ///
     /// Output Parameters
     /// \param arr_out : output array initialized by the input array
-    void operator()(const Device_out* dev_out,
-                    const Device_in* dev_in,
-                    FPTYPE_out* arr_out,
+    void operator()(FPTYPE_out* arr_out,
                     const FPTYPE_in* arr_in,
                     const size_t size);
 };
@@ -88,13 +78,12 @@ struct delete_memory_op
     /// @brief free memory for multi-device
     ///
     /// Input Parameters
-    /// \param dev : the type of computing device
     /// \param arr : the input array
-    void operator()(const Device* dev, FPTYPE* arr);
+    void operator()(FPTYPE* arr);
 };
 
 template <typename FPTYPE>
-void resize_memory(FPTYPE* arr, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
+void resize_memory(FPTYPE* arr, const size_t size, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
 
 template <typename FPTYPE>
 void set_memory(FPTYPE* arr, const int var, const size_t size, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice);
@@ -113,8 +102,7 @@ void delete_memory(FPTYPE* arr, base_device::AbacusDevice_t device_type = base_d
 template <typename FPTYPE>
 struct resize_memory_op<FPTYPE, base_device::DEVICE_GPU>
 {
-    void operator()(const base_device::DEVICE_GPU* dev,
-                    FPTYPE*& arr,
+    void operator()(FPTYPE*& arr,
                     const size_t size,
                     const char* record_in = nullptr);
 };
@@ -122,33 +110,27 @@ struct resize_memory_op<FPTYPE, base_device::DEVICE_GPU>
 template <typename FPTYPE>
 struct set_memory_op<FPTYPE, base_device::DEVICE_GPU>
 {
-    void operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr, const int var, const size_t size);
+    void operator()(FPTYPE* arr, const int var, const size_t size);
 };
 
 template <typename FPTYPE>
 struct synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>
 {
-    void operator()(const base_device::DEVICE_CPU* dev_out,
-                    const base_device::DEVICE_GPU* dev_in,
-                    FPTYPE* arr_out,
+    void operator()(FPTYPE* arr_out,
                     const FPTYPE* arr_in,
                     const size_t size);
 };
 template <typename FPTYPE>
 struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>
 {
-    void operator()(const base_device::DEVICE_GPU* dev_out,
-                    const base_device::DEVICE_CPU* dev_in,
-                    FPTYPE* arr_out,
+    void operator()(FPTYPE* arr_out,
                     const FPTYPE* arr_in,
                     const size_t size);
 };
 template <typename FPTYPE>
 struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>
 {
-    void operator()(const base_device::DEVICE_GPU* dev_out,
-                    const base_device::DEVICE_GPU* dev_in,
-                    FPTYPE* arr_out,
+    void operator()(FPTYPE* arr_out,
                     const FPTYPE* arr_in,
                     const size_t size);
 };
@@ -156,7 +138,7 @@ struct synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVIC
 template <typename FPTYPE>
 struct delete_memory_op<FPTYPE, base_device::DEVICE_GPU>
 {
-    void operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr);
+    void operator()(FPTYPE* arr);
 };
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 
@@ -168,13 +150,12 @@ struct resize_memory_op_mt
     /// @brief Allocate memory for a given pointer. Note this op will free the pointer first.
     ///
     /// Input Parameters
-    /// \param dev : the type of computing device
     /// \param size : array size
     /// \param record_string : label for memory record
     ///
     /// Output Parameters
     /// \param arr : allocated array
-    void operator()(const Device* dev, FPTYPE*& arr, const size_t size, const char* record_in = nullptr);
+    void operator()(FPTYPE*& arr, const size_t size, const char* record_in = nullptr);
 };
 
 template <typename FPTYPE, typename Device>
@@ -183,9 +164,8 @@ struct delete_memory_op_mt
     /// @brief free memory for multi-device
     ///
     /// Input Parameters
-    /// \param dev : the type of computing device
     /// \param arr : the input array
-    void operator()(const Device* dev, FPTYPE* arr);
+    void operator()(FPTYPE* arr);
 };
 
 #endif // __DSP
diff --git a/source/module_base/module_device/rocm/memory_op.hip.cu b/source/module_base/module_device/rocm/memory_op.hip.cu
index 1909cfb771..7e4cf7f497 100644
--- a/source/module_base/module_device/rocm/memory_op.hip.cu
+++ b/source/module_base/module_device/rocm/memory_op.hip.cu
@@ -39,21 +39,19 @@ __global__ void cast_memory(std::complex<FPTYPE_out>* out, const std::complex<FP
 }
 
 template <typename FPTYPE>
-void resize_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev,
-                                                                   FPTYPE*& arr,
+void resize_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(FPTYPE*& arr,
                                                                    const size_t size,
                                                                    const char* record_in)
 {
     if (arr != nullptr)
     {
-        delete_memory_op<FPTYPE, base_device::DEVICE_GPU>()(dev, arr);
+        delete_memory_op<FPTYPE, base_device::DEVICE_GPU>()(arr);
     }
     hipErrcheck(hipMalloc((void**)&arr, sizeof(FPTYPE) * size));
 }
 
 template <typename FPTYPE>
-void set_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev,
-                                                                FPTYPE* arr,
+void set_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(FPTYPE* arr,
                                                                 const int var,
                                                                 const size_t size)
 {
@@ -62,8 +60,6 @@ void set_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_devic
 
 template <typename FPTYPE>
 void synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_CPU* dev_out,
-    const base_device::DEVICE_GPU* dev_in,
     FPTYPE* arr_out,
     const FPTYPE* arr_in,
     const size_t size)
@@ -73,8 +69,6 @@ void synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, base_device::DEVICE_
 
 template <typename FPTYPE>
 void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_CPU>::operator()(
-    const base_device::DEVICE_GPU* dev_out,
-    const base_device::DEVICE_CPU* dev_in,
     FPTYPE* arr_out,
     const FPTYPE* arr_in,
     const size_t size)
@@ -84,8 +78,6 @@ void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_
 
 template <typename FPTYPE>
 void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_GPU>::operator()(
-    const base_device::DEVICE_GPU* dev_out,
-    const base_device::DEVICE_GPU* dev_in,
     FPTYPE* arr_out,
     const FPTYPE* arr_in,
     const size_t size)
@@ -95,9 +87,7 @@ void synchronize_memory_op<FPTYPE, base_device::DEVICE_GPU, base_device::DEVICE_
 
 template <typename FPTYPE_out, typename FPTYPE_in>
 struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_GPU> {
-    void operator()(const base_device::DEVICE_GPU* dev_out,
-                    const base_device::DEVICE_GPU* dev_in,
-                    FPTYPE_out* arr_out,
+    void operator()(FPTYPE_out* arr_out,
                     const FPTYPE_in* arr_in,
                     const size_t size) {
 
@@ -110,9 +100,7 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
 
 template <typename FPTYPE_out, typename FPTYPE_in>
 struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_device::DEVICE_CPU> {
-    void operator()(const base_device::DEVICE_GPU* dev_out,
-                    const base_device::DEVICE_CPU* dev_in,
-                    FPTYPE_out* arr_out,
+    void operator()(FPTYPE_out* arr_out,
                     const FPTYPE_in* arr_in,
                     const size_t size) {
 
@@ -139,9 +127,7 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_GPU, base_devic
 
 template <typename FPTYPE_out, typename FPTYPE_in>
 struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_device::DEVICE_GPU> {
-    void operator()(const base_device::DEVICE_CPU* dev_out,
-                    const base_device::DEVICE_GPU* dev_in,
-                    FPTYPE_out* arr_out,
+    void operator()(FPTYPE_out* arr_out,
                     const FPTYPE_in* arr_in,
                     const size_t size) {
 
@@ -166,7 +152,7 @@ struct cast_memory_op<FPTYPE_out, FPTYPE_in, base_device::DEVICE_CPU, base_devic
 };
 
 template <typename FPTYPE>
-void delete_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr)
+void delete_memory_op<FPTYPE, base_device::DEVICE_GPU>::operator()(FPTYPE* arr)
 {
     hipErrcheck(hipFree(arr));
 }
diff --git a/source/module_base/module_device/test/memory_test.cpp b/source/module_base/module_device/test/memory_test.cpp
index 6dc45e5091..39c5c25d52 100644
--- a/source/module_base/module_device/test/memory_test.cpp
+++ b/source/module_base/module_device/test/memory_test.cpp
@@ -91,7 +91,7 @@ class TestModulePsiMemory : public ::testing::Test
 TEST_F(TestModulePsiMemory, set_memory_op_double_cpu)
 {
     std::vector<double> v_xx = xx;
-    set_memory_double_cpu_op()(cpu_ctx, v_xx.data(), 0, xx.size());
+    set_memory_double_cpu_op()(v_xx.data(), 0, xx.size());
     for (int ii = 0; ii < xx.size(); ii++)
     {
         EXPECT_EQ(v_xx[ii], 0.0);
@@ -101,7 +101,7 @@ TEST_F(TestModulePsiMemory, set_memory_op_double_cpu)
 TEST_F(TestModulePsiMemory, set_memory_op_complex_double_cpu)
 {
     std::vector<std::complex<double>> vz_xx = z_xx;
-    set_memory_complex_double_cpu_op()(cpu_ctx, vz_xx.data(), 0, z_xx.size());
+    set_memory_complex_double_cpu_op()(vz_xx.data(), 0, z_xx.size());
     for (int ii = 0; ii < z_xx.size(); ii++)
     {
         EXPECT_EQ(vz_xx[ii], std::complex<double>(0.0, 0.0));
@@ -111,7 +111,7 @@ TEST_F(TestModulePsiMemory, set_memory_op_complex_double_cpu)
 TEST_F(TestModulePsiMemory, resize_memory_op_double_cpu)
 {
     double* xx_tmp = NULL;
-    resize_memory_double_cpu_op()(cpu_ctx, xx_tmp, xx.size());
+    resize_memory_double_cpu_op()(xx_tmp, xx.size());
     for (int ii = 0; ii < xx.size(); ii++)
     {
         xx_tmp[ii] = xx[ii];
@@ -126,7 +126,7 @@ TEST_F(TestModulePsiMemory, resize_memory_op_double_cpu)
 TEST_F(TestModulePsiMemory, resize_memory_op_comlex_double_cpu)
 {
     std::complex<double>* z_xx_tmp = NULL;
-    resize_memory_comlex_double_cpu_op()(cpu_ctx, z_xx_tmp, z_xx.size());
+    resize_memory_comlex_double_cpu_op()(z_xx_tmp, z_xx.size());
     for (int ii = 0; ii < z_xx.size(); ii++)
     {
         z_xx_tmp[ii] = z_xx[ii];
@@ -141,7 +141,7 @@ TEST_F(TestModulePsiMemory, resize_memory_op_comlex_double_cpu)
 TEST_F(TestModulePsiMemory, synchronize_memory_op_double_cpu_to_cpu)
 {
     std::vector<double> h_xx(xx.size(), 0);
-    synchronize_memory_double_cpu_to_cpu_op()(cpu_ctx, cpu_ctx, h_xx.data(), xx.data(), xx.size());
+    synchronize_memory_double_cpu_to_cpu_op()(h_xx.data(), xx.data(), xx.size());
     for (int ii = 0; ii < z_xx.size(); ii++)
     {
         EXPECT_EQ(h_xx[ii], xx[ii]);
@@ -151,7 +151,7 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_double_cpu_to_cpu)
 TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_cpu_to_cpu)
 {
     std::vector<std::complex<double>> hz_xx(z_xx.size(), std::complex<double>(0, 0));
-    synchronize_memory_complex_double_cpu_to_cpu_op()(cpu_ctx, cpu_ctx, hz_xx.data(), z_xx.data(), z_xx.size());
+    synchronize_memory_complex_double_cpu_to_cpu_op()(hz_xx.data(), z_xx.data(), z_xx.size());
     for (int ii = 0; ii < z_xx.size(); ii++)
     {
         EXPECT_EQ(hz_xx[ii], z_xx[ii]);
@@ -161,13 +161,13 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_cpu_to_cpu)
 TEST_F(TestModulePsiMemory, delete_memory_op_double_cpu)
 {
     double* h_xx = (double*)malloc(sizeof(double) * xx.size());
-    delete_memory_double_cpu_op()(cpu_ctx, h_xx);
+    delete_memory_double_cpu_op()(h_xx);
 }
 
 TEST_F(TestModulePsiMemory, delete_memory_op_complex_double_cpu)
 {
     std::complex<double>* hz_xx = (std::complex<double>*)malloc(sizeof(std::complex<double>) * z_xx.size());
-    delete_memory_complex_double_cpu_op()(cpu_ctx, hz_xx);
+    delete_memory_complex_double_cpu_op()(hz_xx);
 }
 
 #if __UT_USE_CUDA || __UT_USE_ROCM
@@ -175,7 +175,7 @@ TEST_F(TestModulePsiMemory, set_memory_op_double_gpu)
 {
     thrust::device_ptr<double> d_xx = thrust::device_malloc<double>(xx.size());
     thrust::copy(xx.begin(), xx.end(), d_xx);
-    set_memory_double_gpu_op()(gpu_ctx, thrust::raw_pointer_cast(d_xx), 0, xx.size());
+    set_memory_double_gpu_op()(thrust::raw_pointer_cast(d_xx), 0, xx.size());
     thrust::host_vector<double> h_xx(xx.size());
     thrust::copy(d_xx, d_xx + xx.size(), h_xx.begin());
     for (int ii = 0; ii < xx.size(); ii++)
@@ -188,7 +188,7 @@ TEST_F(TestModulePsiMemory, set_memory_op_complex_double_gpu)
 {
     thrust::device_ptr<std::complex<double>> dz_xx = thrust::device_malloc<std::complex<double>>(z_xx.size());
     thrust::copy(z_xx.begin(), z_xx.end(), dz_xx);
-    set_memory_complex_double_gpu_op()(gpu_ctx, thrust::raw_pointer_cast(dz_xx), 0, z_xx.size());
+    set_memory_complex_double_gpu_op()(thrust::raw_pointer_cast(dz_xx), 0, z_xx.size());
     thrust::host_vector<std::complex<double>> h_xx(z_xx.size());
     thrust::copy(dz_xx, dz_xx + z_xx.size(), h_xx.begin());
     for (int ii = 0; ii < z_xx.size(); ii++)
@@ -200,7 +200,7 @@ TEST_F(TestModulePsiMemory, set_memory_op_complex_double_gpu)
 TEST_F(TestModulePsiMemory, resize_memory_op_double_gpu)
 {
     double* xx_tmp = NULL;
-    resize_memory_double_gpu_op()(gpu_ctx, xx_tmp, xx.size());
+    resize_memory_double_gpu_op()(xx_tmp, xx.size());
 
     thrust::device_ptr<double> d_xx(xx_tmp);
     thrust::copy(xx.begin(), xx.end(), d_xx);
@@ -217,7 +217,7 @@ TEST_F(TestModulePsiMemory, resize_memory_op_double_gpu)
 TEST_F(TestModulePsiMemory, resize_memory_op_complex_double_gpu)
 {
     std::complex<double>* z_xx_tmp = NULL;
-    resize_memory_comlex_double_gpu_op()(gpu_ctx, z_xx_tmp, z_xx.size());
+    resize_memory_comlex_double_gpu_op()(z_xx_tmp, z_xx.size());
 
     thrust::device_ptr<std::complex<double>> dz_xx(z_xx_tmp);
     thrust::copy(z_xx.begin(), z_xx.end(), dz_xx);
@@ -236,7 +236,7 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_double_cpu_to_gpu)
     thrust::device_ptr<double> d_xx = thrust::device_malloc<double>(xx.size());
     std::vector<double> hv_xx(xx.size(), 0);
     thrust::copy(hv_xx.begin(), hv_xx.end(), d_xx);
-    synchronize_memory_double_cpu_to_gpu_op()(gpu_ctx, cpu_ctx, thrust::raw_pointer_cast(d_xx), xx.data(), xx.size());
+    synchronize_memory_double_cpu_to_gpu_op()(thrust::raw_pointer_cast(d_xx), xx.data(), xx.size());
 
     thrust::host_vector<double> h_xx(xx.size());
     thrust::copy(d_xx, d_xx + xx.size(), h_xx.begin());
@@ -252,9 +252,7 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_double_gpu_to_cpu)
     thrust::device_ptr<double> d_xx = thrust::device_malloc<double>(xx.size());
     thrust::copy(xx.begin(), xx.end(), d_xx);
     thrust::host_vector<double> h_xx(xx.size());
-    synchronize_memory_double_gpu_to_cpu_op()(cpu_ctx,
-                                              gpu_ctx,
-                                              thrust::raw_pointer_cast(h_xx.data()),
+    synchronize_memory_double_gpu_to_cpu_op()(thrust::raw_pointer_cast(h_xx.data()),
                                               thrust::raw_pointer_cast(d_xx),
                                               xx.size());
 
@@ -270,9 +268,7 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_double_gpu_to_gpu)
     thrust::device_ptr<double> d1_xx = thrust::device_malloc<double>(xx.size());
     thrust::device_ptr<double> d2_xx = thrust::device_malloc<double>(xx.size());
     thrust::copy(xx.begin(), xx.end(), d1_xx);
-    synchronize_memory_double_gpu_to_gpu_op()(gpu_ctx,
-                                              gpu_ctx,
-                                              thrust::raw_pointer_cast(d2_xx),
+    synchronize_memory_double_gpu_to_gpu_op()(thrust::raw_pointer_cast(d2_xx),
                                               thrust::raw_pointer_cast(d1_xx),
                                               xx.size());
 
@@ -291,9 +287,7 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_cpu_to_gpu)
     thrust::device_ptr<std::complex<double>> dz_xx = thrust::device_malloc<std::complex<double>>(z_xx.size());
     std::vector<std::complex<double>> hvz_xx(z_xx.size(), 0);
     thrust::copy(hvz_xx.begin(), hvz_xx.end(), dz_xx);
-    synchronize_memory_complex_double_cpu_to_gpu_op()(gpu_ctx,
-                                                      cpu_ctx,
-                                                      thrust::raw_pointer_cast(dz_xx),
+    synchronize_memory_complex_double_cpu_to_gpu_op()(thrust::raw_pointer_cast(dz_xx),
                                                       z_xx.data(),
                                                       z_xx.size());
 
@@ -311,9 +305,7 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_gpu_to_cpu)
     thrust::device_ptr<std::complex<double>> dz_xx = thrust::device_malloc<std::complex<double>>(z_xx.size());
     thrust::copy(z_xx.begin(), z_xx.end(), dz_xx);
     thrust::host_vector<std::complex<double>> hz_xx(z_xx.size());
-    synchronize_memory_complex_double_gpu_to_cpu_op()(cpu_ctx,
-                                                      gpu_ctx,
-                                                      thrust::raw_pointer_cast(hz_xx.data()),
+    synchronize_memory_complex_double_gpu_to_cpu_op()(thrust::raw_pointer_cast(hz_xx.data()),
                                                       thrust::raw_pointer_cast(dz_xx),
                                                       z_xx.size());
 
@@ -329,9 +321,7 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_gpu_to_gpu)
     thrust::device_ptr<std::complex<double>> dz1_xx = thrust::device_malloc<std::complex<double>>(z_xx.size());
     thrust::device_ptr<std::complex<double>> dz2_xx = thrust::device_malloc<std::complex<double>>(z_xx.size());
     thrust::copy(z_xx.begin(), z_xx.end(), dz1_xx);
-    synchronize_memory_complex_double_gpu_to_gpu_op()(gpu_ctx,
-                                                      gpu_ctx,
-                                                      thrust::raw_pointer_cast(dz2_xx),
+    synchronize_memory_complex_double_gpu_to_gpu_op()(thrust::raw_pointer_cast(dz2_xx),
                                                       thrust::raw_pointer_cast(dz1_xx),
                                                       z_xx.size());
 
@@ -348,13 +338,13 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_gpu_to_gpu)
 TEST_F(TestModulePsiMemory, delete_memory_op_double_gpu)
 {
     thrust::device_ptr<double> d_xx = thrust::device_malloc<double>(xx.size());
-    delete_memory_double_gpu_op()(gpu_ctx, thrust::raw_pointer_cast(d_xx));
+    delete_memory_double_gpu_op()(thrust::raw_pointer_cast(d_xx));
 }
 
 TEST_F(TestModulePsiMemory, delete_memory_op_complex_double_gpu)
 {
     thrust::device_ptr<std::complex<double>> dz_xx = thrust::device_malloc<std::complex<double>>(z_xx.size());
-    delete_memory_complex_double_gpu_op()(gpu_ctx, thrust::raw_pointer_cast(dz_xx));
+    delete_memory_complex_double_gpu_op()(thrust::raw_pointer_cast(dz_xx));
 }
 
 #endif // __UT_USE_CUDA || __UT_USE_ROCM
diff --git a/source/module_base/parallel_device.h b/source/module_base/parallel_device.h
index 09625f6303..7c41b8f28f 100644
--- a/source/module_base/parallel_device.h
+++ b/source/module_base/parallel_device.h
@@ -37,14 +37,14 @@ void bcast_dev(const Device* ctx, T* object, const int& n, const MPI_Comm& comm,
     {
         if(tmp_space == nullptr)
         {
-            base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(cpu_ctx, object_cpu, n);
+            base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(object_cpu, n);
             alloc = true;
         }
         else
         {
             object_cpu = tmp_space;
         }
-        base_device::memory::synchronize_memory_op<T, base_device::DEVICE_CPU, Device>()(cpu_ctx, ctx, object_cpu, object, n);
+        base_device::memory::synchronize_memory_op<T, base_device::DEVICE_CPU, Device>()(object_cpu, object, n);
     }
     else
     {
@@ -55,10 +55,10 @@ void bcast_dev(const Device* ctx, T* object, const int& n, const MPI_Comm& comm,
 
     if (base_device::get_device_type<Device>(ctx) == base_device::GpuDevice)
     {
-        base_device::memory::synchronize_memory_op<T, Device, base_device::DEVICE_CPU>()(ctx, cpu_ctx, object, object_cpu, n);
+        base_device::memory::synchronize_memory_op<T, Device, base_device::DEVICE_CPU>()(object, object_cpu, n);
         if(alloc)
         {
-            base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(cpu_ctx, object_cpu);
+            base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(object_cpu);
         }
     }
     return;
@@ -74,14 +74,14 @@ void reduce_dev(const Device* ctx, T* object, const int& n, const MPI_Comm& comm
     {
         if(tmp_space == nullptr)
         {
-            base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(cpu_ctx, object_cpu, n);
+            base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(object_cpu, n);
             alloc = true;
         }
         else
         {
             object_cpu = tmp_space;
         }
-        base_device::memory::synchronize_memory_op<T, base_device::DEVICE_CPU, Device>()(cpu_ctx, ctx, object_cpu, object, n);
+        base_device::memory::synchronize_memory_op<T, base_device::DEVICE_CPU, Device>()(object_cpu, object, n);
     }
     else
     {
@@ -92,10 +92,10 @@ void reduce_dev(const Device* ctx, T* object, const int& n, const MPI_Comm& comm
 
     if (base_device::get_device_type<Device>(ctx) == base_device::GpuDevice)
     {
-        base_device::memory::synchronize_memory_op<T, Device, base_device::DEVICE_CPU>()(ctx, cpu_ctx, object, object_cpu, n);
+        base_device::memory::synchronize_memory_op<T, Device, base_device::DEVICE_CPU>()(object, object_cpu, n);
         if(alloc)
         {
-            base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(cpu_ctx, object_cpu);
+            base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(object_cpu);
         }
     }
     return;
diff --git a/source/module_base/test/blas_connector_test.cpp b/source/module_base/test/blas_connector_test.cpp
index 34f4cb51bb..dfe1e484b1 100644
--- a/source/module_base/test/blas_connector_test.cpp
+++ b/source/module_base/test/blas_connector_test.cpp
@@ -101,17 +101,17 @@ TEST(blas_connector, ScalGpu) {
     const int incx = 1;
     std::complex<double> result[8], answer[8];
     std::complex<double>* result_gpu = nullptr;
-    resmem_zd_op()(gpu_ctx, result_gpu, 8 * sizeof(std::complex<double>));
+    resmem_zd_op()(result_gpu, size); // memory ops take an element count, not a byte count
     for (int i=0; i< size; i++) {
         result[i] = std::complex<double>{static_cast<double>(std::rand() / double(RAND_MAX)),
                  static_cast<double>(std::rand() / double(RAND_MAX))};
     };
     for (int i = 0; i < size; i++)
         answer[i] = result[i] * scale;
-    syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, result_gpu, result, sizeof(std::complex<double>) * 8);
+    syncmem_z2z_h2d_op()(result_gpu, result, size);
     BlasConnector::scal(size,scale,result_gpu,incx,base_device::AbacusDevice_t::GpuDevice);
-    syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, result, result_gpu, sizeof(std::complex<double>) * 8);
-    delmem_zd_op()(gpu_ctx, result_gpu);
+    syncmem_z2z_d2h_op()(result, result_gpu, size); // copying more than `size` elements would overrun result[8]
+    delmem_zd_op()(result_gpu);
     // incx is the spacing between elements if result
     for (int i = 0; i < size; i++) {
         EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real());
@@ -198,8 +198,8 @@ TEST(blas_connector, AxpyGpu) {
     std::array<T, size> x_const, result, answer;
     T* x_gpu = nullptr;
     T* result_gpu = nullptr;
-    resmem_zd_op()(gpu_ctx, x_gpu, size * sizeof(std::complex<double>));
-    resmem_zd_op()(gpu_ctx, result_gpu, size * sizeof(std::complex<double>));
+    resmem_zd_op()(x_gpu, size); // memory ops take an element count, not a byte count
+    resmem_zd_op()(result_gpu, size);
     std::generate(x_const.begin(), x_const.end(), []() {
         return T{static_cast<double>(std::rand() / double(RAND_MAX)),
                  static_cast<double>(std::rand() / double(RAND_MAX))};
@@ -210,12 +210,12 @@ TEST(blas_connector, AxpyGpu) {
     });
     for (int i = 0; i < size; i++)
         answer[i] = x_const[i] * scale + result[i];
-    syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, result_gpu, result.data(), sizeof(std::complex<double>) * size);
-    syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, x_gpu, x_const.data(), sizeof(std::complex<double>) * size);
+    syncmem_z2z_h2d_op()(result_gpu, result.data(), size);
+    syncmem_z2z_h2d_op()(x_gpu, x_const.data(), size);
     BlasConnector::axpy(size, scale, x_gpu, incx, result_gpu, incy, base_device::AbacusDevice_t::GpuDevice);
-    syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, result.data(), result_gpu, sizeof(std::complex<double>) * size);
-    delmem_zd_op()(gpu_ctx, result_gpu);
-    delmem_zd_op()(gpu_ctx, x_gpu);
+    syncmem_z2z_d2h_op()(result.data(), result_gpu, size); // copying more than `size` elements would overrun the host std::array
+    delmem_zd_op()(result_gpu);
+    delmem_zd_op()(x_gpu);
     for (int i = 0; i < size; i++) {
         EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real());
         EXPECT_DOUBLE_EQ(answer[i].imag(), result[i].imag());
@@ -640,9 +640,9 @@ TEST(blas_connector, GemmGpu) {
     std::complex<double>* a_gpu = nullptr;
     std::complex<double>* b_gpu = nullptr;
     std::complex<double>* result_gpu = nullptr;
-    resmem_zd_op()(gpu_ctx, a_gpu, size_k * lda * sizeof(std::complex<double>));
-    resmem_zd_op()(gpu_ctx, b_gpu, size_n * ldb * sizeof(std::complex<double>));
-    resmem_zd_op()(gpu_ctx, result_gpu, size_n * ldc * sizeof(std::complex<double>));
+    resmem_zd_op()(a_gpu, size_k * lda); // memory ops take an element count, not a byte count
+    resmem_zd_op()(b_gpu, size_n * ldb);
+    resmem_zd_op()(result_gpu, size_n * ldc);
     std::generate(a_const.begin(), a_const.end(), []() {
         return T{static_cast<double>(std::rand() / double(RAND_MAX)),
                  static_cast<double>(std::rand() / double(RAND_MAX))};
@@ -665,16 +665,16 @@ TEST(blas_connector, GemmGpu) {
                                   beta_const * result[i + j * ldc];
         }
     }
-    syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, a_gpu, a_const.data(), sizeof(std::complex<double>) * size_k * lda);
-    syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, b_gpu, b_const.data(), sizeof(std::complex<double>) * size_n * ldb);
-    syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, result_gpu, result.data(), sizeof(std::complex<double>) * size_n * ldc);
+    syncmem_z2z_h2d_op()(a_gpu, a_const.data(), size_k * lda);
+    syncmem_z2z_h2d_op()(b_gpu, b_const.data(), size_n * ldb);
+    syncmem_z2z_h2d_op()(result_gpu, result.data(), size_n * ldc);
     BlasConnector::gemm_cm(transa_m, transb_m, size_m, size_n, size_k, alpha_const,
            a_gpu, lda, b_gpu, ldb, beta_const,
            result_gpu, ldc, base_device::AbacusDevice_t::GpuDevice);
-    syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, result.data(), result_gpu, sizeof(std::complex<double>) * size_n * ldc);
-    delmem_zd_op()(gpu_ctx, result_gpu);
-    delmem_zd_op()(gpu_ctx, a_gpu);
-    delmem_zd_op()(gpu_ctx, b_gpu);
+    syncmem_z2z_d2h_op()(result.data(), result_gpu, size_n * ldc); // copying more elements would overrun the host buffer
+    delmem_zd_op()(result_gpu);
+    delmem_zd_op()(a_gpu);
+    delmem_zd_op()(b_gpu);
     for (int i = 0; i < size_m; i++)
         for (int j = 0; j < size_n; j++) {
             EXPECT_DOUBLE_EQ(answer[i + j * ldc].real(),
diff --git a/source/module_basis/module_pw/kernels/test/pw_op_test.cpp b/source/module_basis/module_pw/kernels/test/pw_op_test.cpp
index 96cc760383..6adac4613f 100644
--- a/source/module_basis/module_pw/kernels/test/pw_op_test.cpp
+++ b/source/module_basis/module_pw/kernels/test/pw_op_test.cpp
@@ -102,43 +102,43 @@ TEST_F(TestModulePWPWMultiDevice, set_3d_fft_box_op_gpu)
     std::vector<std::complex<double>> res(out_1.size(), std::complex<double>{0, 0});
     int * d_box_index = NULL;
     std::complex<double>* d_res = NULL, * d_in_1 = NULL;
-    resize_memory_int_gpu_op()(gpu_ctx, d_box_index, box_index.size());
-    resize_memory_complex_gpu_op()(gpu_ctx, d_res, res.size());
-    resize_memory_complex_gpu_op()(gpu_ctx, d_in_1, in_1.size());
-    synchronize_memory_int_h2d_op()(gpu_ctx, cpu_ctx, d_box_index, box_index.data(), box_index.size());
-    synchronize_memory_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size());
-    synchronize_memory_complex_h2d_op()(gpu_ctx, cpu_ctx, d_in_1, in_1.data(), in_1.size());
+    resize_memory_int_gpu_op()(d_box_index, box_index.size());
+    resize_memory_complex_gpu_op()(d_res, res.size());
+    resize_memory_complex_gpu_op()(d_in_1, in_1.size());
+    synchronize_memory_int_h2d_op()(d_box_index, box_index.data(), box_index.size());
+    synchronize_memory_complex_h2d_op()(d_res, res.data(), res.size());
+    synchronize_memory_complex_h2d_op()(d_in_1, in_1.data(), in_1.size());
 
     set_3d_fft_box_gpu_op()(gpu_ctx, this->npwk, d_box_index, d_in_1, d_res);
 
-    synchronize_memory_complex_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size());
+    synchronize_memory_complex_d2h_op()(res.data(), d_res, res.size());
 
     for (int ii = 0; ii < this->nxyz; ii++) {
         EXPECT_LT(fabs(res[ii] - out_1[ii]), 1e-12);
     }
-    delete_memory_int_gpu_op()(gpu_ctx, d_box_index);
-    delete_memory_complex_gpu_op()(gpu_ctx, d_res);
-    delete_memory_complex_gpu_op()(gpu_ctx, d_in_1);
+    delete_memory_int_gpu_op()(d_box_index);
+    delete_memory_complex_gpu_op()(d_res);
+    delete_memory_complex_gpu_op()(d_in_1);
 }
 
 TEST_F(TestModulePWPWMultiDevice, set_recip_to_real_output_op_gpu)
 {
     std::vector<std::complex<double>> res(out_2.size(), std::complex<double>{0, 0});
     std::complex<double>* d_res = NULL, * d_in_2 = NULL;
-    resize_memory_complex_gpu_op()(gpu_ctx, d_res, res.size());
-    resize_memory_complex_gpu_op()(gpu_ctx, d_in_2, in_2.size());
-    synchronize_memory_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size());
-    synchronize_memory_complex_h2d_op()(gpu_ctx, cpu_ctx, d_in_2, in_2.data(), in_2.size());
+    resize_memory_complex_gpu_op()(d_res, res.size());
+    resize_memory_complex_gpu_op()(d_in_2, in_2.size());
+    synchronize_memory_complex_h2d_op()(d_res, res.data(), res.size());
+    synchronize_memory_complex_h2d_op()(d_in_2, in_2.data(), in_2.size());
 
     set_recip_to_real_output_gpu_op()(gpu_ctx, this->nxyz, this->add, this->factor, d_in_2, d_res);
 
-    synchronize_memory_complex_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size());
+    synchronize_memory_complex_d2h_op()(res.data(), d_res, res.size());
 
     for (int ii = 0; ii < this->nxyz; ii++) {
         EXPECT_LT(fabs(res[ii] - out_2[ii]), 1e-12);
     }
-    delete_memory_complex_gpu_op()(gpu_ctx, d_res);
-    delete_memory_complex_gpu_op()(gpu_ctx, d_in_2);
+    delete_memory_complex_gpu_op()(d_res);
+    delete_memory_complex_gpu_op()(d_in_2);
 }
 
 TEST_F(TestModulePWPWMultiDevice, set_real_to_recip_output_op_gpu)
@@ -146,23 +146,23 @@ TEST_F(TestModulePWPWMultiDevice, set_real_to_recip_output_op_gpu)
     std::vector<std::complex<double>> res = out_3_init;
     int * d_box_index = NULL;
     std::complex<double>* d_res = NULL, * d_in_3 = NULL;
-    resize_memory_int_gpu_op()(gpu_ctx, d_box_index, box_index.size());
-    resize_memory_complex_gpu_op()(gpu_ctx, d_res, res.size());
-    resize_memory_complex_gpu_op()(gpu_ctx, d_in_3, in_3.size());
-    synchronize_memory_int_h2d_op()(gpu_ctx, cpu_ctx, d_box_index, box_index.data(), box_index.size());
-    synchronize_memory_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size());
-    synchronize_memory_complex_h2d_op()(gpu_ctx, cpu_ctx, d_in_3, in_3.data(), in_3.size());
+    resize_memory_int_gpu_op()(d_box_index, box_index.size());
+    resize_memory_complex_gpu_op()(d_res, res.size());
+    resize_memory_complex_gpu_op()(d_in_3, in_3.size());
+    synchronize_memory_int_h2d_op()(d_box_index, box_index.data(), box_index.size());
+    synchronize_memory_complex_h2d_op()(d_res, res.data(), res.size());
+    synchronize_memory_complex_h2d_op()(d_in_3, in_3.data(), in_3.size());
 
     set_real_to_recip_output_gpu_op()(gpu_ctx, this->npwk, this->nxyz, true, this->factor, d_box_index, d_in_3, d_res);
 
-    synchronize_memory_complex_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size());
+    synchronize_memory_complex_d2h_op()(res.data(), d_res, res.size());
 
     for (int ii = 0; ii < out_3.size(); ii++) {
         EXPECT_LT(fabs(res[ii] - out_3[ii]), 5e-6);
     }
-    delete_memory_int_gpu_op()(gpu_ctx, d_box_index);
-    delete_memory_complex_gpu_op()(gpu_ctx, d_res);
-    delete_memory_complex_gpu_op()(gpu_ctx, d_in_3);
+    delete_memory_int_gpu_op()(d_box_index);
+    delete_memory_complex_gpu_op()(d_res);
+    delete_memory_complex_gpu_op()(d_in_3);
 }
 
 #endif // __UT_USE_CUDA || __UT_USE_ROCM
diff --git a/source/module_basis/module_pw/module_fft/fft_cuda.cpp b/source/module_basis/module_pw/module_fft/fft_cuda.cpp
index db93fb07fb..9bec9253e7 100644
--- a/source/module_basis/module_pw/module_fft/fft_cuda.cpp
+++ b/source/module_basis/module_pw/module_fft/fft_cuda.cpp
@@ -17,14 +17,14 @@ template <>
 void FFT_CUDA<float>::setupFFT()
 {
     cufftPlan3d(&c_handle, this->nx, this->ny, this->nz, CUFFT_C2C);
-    resmem_cd_op()(gpu_ctx, this->c_auxr_3d, this->nx * this->ny * this->nz);
+    resmem_cd_op()(this->c_auxr_3d, this->nx * this->ny * this->nz);
         
 }
 template <>  
 void FFT_CUDA<double>::setupFFT()
 {
     cufftPlan3d(&z_handle, this->nx, this->ny, this->nz, CUFFT_Z2Z);
-    resmem_zd_op()(gpu_ctx, this->z_auxr_3d, this->nx * this->ny * this->nz);
+    resmem_zd_op()(this->z_auxr_3d, this->nx * this->ny * this->nz);
 }
 template <>
 void FFT_CUDA<float>::cleanFFT()
@@ -50,7 +50,7 @@ void FFT_CUDA<float>::clear()
     this->cleanFFT();
     if (c_auxr_3d != nullptr)
     {
-        delmem_cd_op()(gpu_ctx, c_auxr_3d);
+        delmem_cd_op()(c_auxr_3d);
         c_auxr_3d = nullptr;
     }
 }
@@ -60,7 +60,7 @@ void FFT_CUDA<double>::clear()
     this->cleanFFT();
     if (z_auxr_3d != nullptr)
     {
-        delmem_zd_op()(gpu_ctx, z_auxr_3d);
+        delmem_zd_op()(z_auxr_3d);
         z_auxr_3d = nullptr;
     }
 }
diff --git a/source/module_basis/module_pw/module_fft/fft_rocm.cpp b/source/module_basis/module_pw/module_fft/fft_rocm.cpp
index 9973c72901..1dd9c433ec 100644
--- a/source/module_basis/module_pw/module_fft/fft_rocm.cpp
+++ b/source/module_basis/module_pw/module_fft/fft_rocm.cpp
@@ -16,14 +16,14 @@ template <>
 void FFT_ROCM<float>::setupFFT()
 {
     hipfftPlan3d(&c_handle, this->nx, this->ny, this->nz, HIPFFT_C2C);
-    resmem_cd_op()(gpu_ctx, this->c_auxr_3d, this->nx * this->ny * this->nz);
+    resmem_cd_op()(this->c_auxr_3d, this->nx * this->ny * this->nz);
         
 }
 template <>  
 void FFT_ROCM<double>::setupFFT()
 {
     hipfftPlan3d(&z_handle, this->nx, this->ny, this->nz, HIPFFT_Z2Z);
-    resmem_zd_op()(gpu_ctx, this->z_auxr_3d, this->nx * this->ny * this->nz);
+    resmem_zd_op()(this->z_auxr_3d, this->nx * this->ny * this->nz);
 }
 template <>
 void FFT_ROCM<float>::cleanFFT()
@@ -49,7 +49,7 @@ void FFT_ROCM<float>::clear()
     this->cleanFFT();
     if (c_auxr_3d != nullptr)
     {
-        delmem_cd_op()(gpu_ctx, c_auxr_3d);
+        delmem_cd_op()(c_auxr_3d);
         c_auxr_3d = nullptr;
     }
 }
@@ -59,7 +59,7 @@ void FFT_ROCM<double>::clear()
     this->cleanFFT();
     if (z_auxr_3d != nullptr)
     {
-        delmem_zd_op()(gpu_ctx, z_auxr_3d);
+        delmem_zd_op()(z_auxr_3d);
         z_auxr_3d = nullptr;
     }
 }
diff --git a/source/module_basis/module_pw/pw_basis.cpp b/source/module_basis/module_pw/pw_basis.cpp
index 7c8a1293da..5fbff68f0c 100644
--- a/source/module_basis/module_pw/pw_basis.cpp
+++ b/source/module_basis/module_pw/pw_basis.cpp
@@ -39,7 +39,7 @@ PW_Basis:: ~PW_Basis()
     delete[] gg_uniq;
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == "gpu") {
-        delmem_int_op()(gpu_ctx, this->d_is2fftixy);
+        delmem_int_op()(this->d_is2fftixy);
     }
 #endif
 }
diff --git a/source/module_basis/module_pw/pw_basis_k.cpp b/source/module_basis/module_pw/pw_basis_k.cpp
index f670ee9bf9..2e0f85372d 100644
--- a/source/module_basis/module_pw/pw_basis_k.cpp
+++ b/source/module_basis/module_pw/pw_basis_k.cpp
@@ -25,24 +25,24 @@ PW_Basis_K::~PW_Basis_K()
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == "gpu") {
         if (this->precision == "single") {
-            delmem_sd_op()(gpu_ctx, this->s_kvec_c);
-            delmem_sd_op()(gpu_ctx, this->s_gcar);
-            delmem_sd_op()(gpu_ctx, this->s_gk2);
+            delmem_sd_op()(this->s_kvec_c);
+            delmem_sd_op()(this->s_gcar);
+            delmem_sd_op()(this->s_gk2);
         }
         else {
-            delmem_dd_op()(gpu_ctx, this->d_gcar);
-            delmem_dd_op()(gpu_ctx, this->d_gk2);
+            delmem_dd_op()(this->d_gcar);
+            delmem_dd_op()(this->d_gk2);
         }
-        delmem_dd_op()(gpu_ctx, this->d_kvec_c);
-        delmem_int_op()(gpu_ctx, this->ig2ixyz_k);
-        delmem_int_op()(gpu_ctx, this->d_igl2isz_k);
+        delmem_dd_op()(this->d_kvec_c);
+        delmem_int_op()(this->ig2ixyz_k);
+        delmem_int_op()(this->d_igl2isz_k);
     }
     else {
 #endif
         if (this->precision == "single") {
-            delmem_sh_op()(cpu_ctx, this->s_kvec_c);
-            delmem_sh_op()(cpu_ctx, this->s_gcar);
-            delmem_sh_op()(cpu_ctx, this->s_gk2);
+            delmem_sh_op()(this->s_kvec_c);
+            delmem_sh_op()(this->s_gcar);
+            delmem_sh_op()(this->s_gk2);
         }
         // There's no need to delete double pointers while in a CPU environment.
 #if defined(__CUDA) || defined(__ROCM)
@@ -99,17 +99,17 @@ void PW_Basis_K:: initparameters(
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == "gpu") {
         if (this->precision == "single") {
-            resmem_sd_op()(gpu_ctx, this->s_kvec_c, this->nks * 3);
-            castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_kvec_c, reinterpret_cast<double *>(&this->kvec_c[0][0]), this->nks * 3);
+            resmem_sd_op()(this->s_kvec_c, this->nks * 3);
+            castmem_d2s_h2d_op()(this->s_kvec_c, reinterpret_cast<double *>(&this->kvec_c[0][0]), this->nks * 3);
         }
-        resmem_dd_op()(gpu_ctx, this->d_kvec_c, this->nks * 3);
-        syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_kvec_c, reinterpret_cast<double *>(&this->kvec_c[0][0]), this->nks * 3);
+        resmem_dd_op()(this->d_kvec_c, this->nks * 3);
+        syncmem_d2d_h2d_op()(this->d_kvec_c, reinterpret_cast<double *>(&this->kvec_c[0][0]), this->nks * 3);
     }
     else {
 #endif
         if (this->precision == "single") {
-            resmem_sh_op()(cpu_ctx, this->s_kvec_c, this->nks * 3);
-            castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_kvec_c, reinterpret_cast<double *>(&this->kvec_c[0][0]), this->nks * 3);
+            resmem_sh_op()(this->s_kvec_c, this->nks * 3);
+            castmem_d2s_h2h_op()(this->s_kvec_c, reinterpret_cast<double *>(&this->kvec_c[0][0]), this->nks * 3);
         }
         this->d_kvec_c = reinterpret_cast<double *>(&this->kvec_c[0][0]);
         // There's no need to allocate double pointers while in a CPU environment.
@@ -164,8 +164,8 @@ void PW_Basis_K::setupIndGk()
     }
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == "gpu") {
-        resmem_int_op()(gpu_ctx, this->d_igl2isz_k, this->npwk_max * this->nks);
-        syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, this->d_igl2isz_k, this->igl2isz_k, this->npwk_max * this->nks);
+        resmem_int_op()(this->d_igl2isz_k, this->npwk_max * this->nks);
+        syncmem_int_h2d_op()(this->d_igl2isz_k, this->igl2isz_k, this->npwk_max * this->nks);
     }
 #endif
     this->get_ig2ixyz_k();
@@ -247,25 +247,25 @@ void PW_Basis_K::collect_local_pw(const double& erf_ecut_in, const double& erf_h
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == "gpu") {
         if (this->precision == "single") {
-            resmem_sd_op()(gpu_ctx, this->s_gk2, this->npwk_max * this->nks);
-            resmem_sd_op()(gpu_ctx, this->s_gcar, this->npwk_max * this->nks * 3);
-            castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_gk2, this->gk2, this->npwk_max * this->nks);
-            castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_gcar, reinterpret_cast<double *>(&this->gcar[0][0]), this->npwk_max * this->nks * 3);
+            resmem_sd_op()(this->s_gk2, this->npwk_max * this->nks);
+            resmem_sd_op()(this->s_gcar, this->npwk_max * this->nks * 3);
+            castmem_d2s_h2d_op()(this->s_gk2, this->gk2, this->npwk_max * this->nks);
+            castmem_d2s_h2d_op()(this->s_gcar, reinterpret_cast<double *>(&this->gcar[0][0]), this->npwk_max * this->nks * 3);
         }
         else {
-            resmem_dd_op()(gpu_ctx, this->d_gk2, this->npwk_max * this->nks);
-            resmem_dd_op()(gpu_ctx, this->d_gcar, this->npwk_max * this->nks * 3);
-            syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_gk2, this->gk2, this->npwk_max * this->nks);
-            syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_gcar, reinterpret_cast<double *>(&this->gcar[0][0]), this->npwk_max * this->nks * 3);
+            resmem_dd_op()(this->d_gk2, this->npwk_max * this->nks);
+            resmem_dd_op()(this->d_gcar, this->npwk_max * this->nks * 3);
+            syncmem_d2d_h2d_op()(this->d_gk2, this->gk2, this->npwk_max * this->nks);
+            syncmem_d2d_h2d_op()(this->d_gcar, reinterpret_cast<double *>(&this->gcar[0][0]), this->npwk_max * this->nks * 3);
         }
     }
     else {
 #endif
         if (this->precision == "single") {
-            resmem_sh_op()(cpu_ctx, this->s_gk2, this->npwk_max * this->nks, "PW_B_K::s_gk2");
-            resmem_sh_op()(cpu_ctx, this->s_gcar, this->npwk_max * this->nks * 3, "PW_B_K::s_gcar");
-            castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_gk2, this->gk2, this->npwk_max * this->nks);
-            castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_gcar, reinterpret_cast<double *>(&this->gcar[0][0]), this->npwk_max * this->nks * 3);
+            resmem_sh_op()(this->s_gk2, this->npwk_max * this->nks, "PW_B_K::s_gk2");
+            resmem_sh_op()(this->s_gcar, this->npwk_max * this->nks * 3, "PW_B_K::s_gcar");
+            castmem_d2s_h2h_op()(this->s_gk2, this->gk2, this->npwk_max * this->nks);
+            castmem_d2s_h2h_op()(this->s_gcar, reinterpret_cast<double *>(&this->gcar[0][0]), this->npwk_max * this->nks * 3);
         }
         else {
             this->d_gcar = reinterpret_cast<double *>(&this->gcar[0][0]);
@@ -355,8 +355,8 @@ void PW_Basis_K::get_ig2ixyz_k()
             ig2ixyz_k_cpu[igl + ik * npwk_max] = iz + iy * nz + ix * ny * nz;
         }
     }
-    resmem_int_op()(gpu_ctx, ig2ixyz_k, this->npwk_max * this->nks);
-    syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, this->ig2ixyz_k, ig2ixyz_k_cpu, this->npwk_max * this->nks);
+    resmem_int_op()(ig2ixyz_k, this->npwk_max * this->nks);
+    syncmem_int_h2d_op()(this->ig2ixyz_k, ig2ixyz_k_cpu, this->npwk_max * this->nks);
     delete[] ig2ixyz_k_cpu;
 }
 
diff --git a/source/module_basis/module_pw/pw_basis_sup.cpp b/source/module_basis/module_pw/pw_basis_sup.cpp
index 1d34682a96..e5422bd5d3 100644
--- a/source/module_basis/module_pw/pw_basis_sup.cpp
+++ b/source/module_basis/module_pw/pw_basis_sup.cpp
@@ -100,8 +100,9 @@ void PW_Basis_Sup::distribution_method3(const ModulePW::PW_Basis* pw_rho)
     this->npw_per = new int[this->poolnproc]; // number of planewaves on each core.
     delete[] this->fftixy2ip;
     this->fftixy2ip = new int[this->fftnxy]; // ip of core which contains the stick on (x, y).
-    for (int ixy = 0; ixy < this->fftnxy; ++ixy)
+    for (int ixy = 0; ixy < this->fftnxy; ++ixy) {
         this->fftixy2ip[ixy] = -1; // meaning this stick has not been distributed or there is no stick on (x, y).
+}
     if (poolrank == 0)
     {
         // (1) Count the total number of planewaves (tot_npw) and sticks (this->nstot).
@@ -212,10 +213,11 @@ void PW_Basis_Sup::divide_sticks_3(
     int fftnx_s = nx_s;
     if (this->gamma_only)
     {
-        if (this->xprime)
+        if (this->xprime) {
             fftnx_s = int(nx_s / 2) + 1;
-        else
+        } else {
             fftny_s = int(ny_s / 2) + 1;
+}
     }
 
     int fftnxy_s = fftnx_s * fftny_s;
@@ -225,15 +227,19 @@ void PW_Basis_Sup::divide_sticks_3(
     {
         int ix = ixy / fftny_s;
         int iy = ixy % fftny_s;
-        if (ix >= int(nx_s / 2) + 1)
+        if (ix >= int(nx_s / 2) + 1) {
             ix -= nx_s;
-        if (iy >= int(ny_s / 2) + 1)
+}
+        if (iy >= int(ny_s / 2) + 1) {
             iy -= ny_s;
+}
 
-        if (ix < 0)
+        if (ix < 0) {
             ix += nx;
-        if (iy < 0)
+}
+        if (iy < 0) {
             iy += ny;
+}
         int index = ix * this->fftny + iy;
         int ip = fftixy2ip_s[ixy];
         if (ip >= 0)
@@ -312,7 +318,7 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy(
 #if defined(__CUDA) || defined(__ROCM)
         if (this->device == "gpu")
         {
-            delmem_int_op()(gpu_ctx, this->d_is2fftixy);
+            delmem_int_op()(this->d_is2fftixy);
             d_is2fftixy = nullptr;
         }
 #endif
@@ -349,8 +355,9 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy(
             fftixy2is[ixy] = st_move;
             st_move++;
         }
-        if (st_move == this->nst)
+        if (st_move == this->nst) {
             break;
+}
     }
 
     // distribute planewaves in the same order as smooth grids first.
@@ -363,19 +370,25 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy(
         int ixy = pw_rho->is2fftixy[is];
         int ix = ixy / pw_rho->fftny;
         int iy = ixy % pw_rho->fftny;
-        if (ix >= int(pw_rho->nx / 2) + 1)
+        if (ix >= int(pw_rho->nx / 2) + 1) {
             ix -= pw_rho->nx;
-        if (iy >= int(pw_rho->ny / 2) + 1)
+}
+        if (iy >= int(pw_rho->ny / 2) + 1) {
             iy -= pw_rho->ny;
-        if (iz >= int(pw_rho->nz / 2) + 1)
+}
+        if (iz >= int(pw_rho->nz / 2) + 1) {
             iz -= pw_rho->nz;
+}
 
-        if (ix < 0)
+        if (ix < 0) {
             ix += this->nx;
-        if (iy < 0)
+}
+        if (iy < 0) {
             iy += this->ny;
-        if (iz < 0)
+}
+        if (iz < 0) {
             iz += this->nz;
+}
         int ixy_now = ix * this->fftny + iy;
         int index = ixy_now * this->nz + iz;
         int is_now = fftixy2is[ixy_now];
@@ -383,8 +396,9 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy(
         this->ig2isz[ig] = isz_now;
         pw_filled++;
         found[index] = true;
-        if (xprime && ix == 0)
+        if (xprime && ix == 0) {
             ng_xeq0++;
+}
     }
     assert(pw_filled == pw_rho->npw);
 
@@ -397,21 +411,24 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy(
             for (int iz = zstart; iz < zstart + st_length2D[ixy]; ++iz)
             {
                 int z = iz;
-                if (z < 0)
+                if (z < 0) {
                     z += this->nz;
+}
                 if (!found[ixy * this->nz + z])
                 {
                     found[ixy * this->nz + z] = true;
                     int is = fftixy2is[ixy];
                     this->ig2isz[pw_filled] = is * this->nz + z;
                     pw_filled++;
-                    if (xprime && ixy / fftny == 0)
+                    if (xprime && ixy / fftny == 0) {
                         ng_xeq0++;
+}
                 }
             }
         }
-        if (pw_filled == this->npw)
+        if (pw_filled == this->npw) {
             break;
+}
     }
 
     delete[] fftixy2is;
@@ -420,8 +437,8 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy(
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == "gpu")
     {
-        resmem_int_op()(gpu_ctx, d_is2fftixy, this->nst);
-        syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, this->d_is2fftixy, this->is2fftixy, this->nst);
+        resmem_int_op()(d_is2fftixy, this->nst);
+        syncmem_int_h2d_op()(this->d_is2fftixy, this->is2fftixy, this->nst);
     }
 #endif
     return;
diff --git a/source/module_basis/module_pw/pw_distributeg.cpp b/source/module_basis/module_pw/pw_distributeg.cpp
index c93ff9357a..0e92d6f665 100644
--- a/source/module_basis/module_pw/pw_distributeg.cpp
+++ b/source/module_basis/module_pw/pw_distributeg.cpp
@@ -101,8 +101,10 @@ void PW_Basis::count_pw_st(
             // so that its index in st_length and st_bottom is 9 * 10 + 2 = 92.
             int x = ix;
             int y = iy;
-            if (x < 0) x += this->nx;
-            if (y < 0) y += this->ny;
+            if (x < 0) { x += this->nx;
+}
+            if (y < 0) { y += this->ny;
+}
             int index = x * this->fftny + y;
 
             int length = 0; // number of planewave on stick (x, y).
@@ -114,13 +116,18 @@ void PW_Basis::count_pw_st(
                 double modulus = f * (this->GGT * f);
                 if (modulus <= this->ggecut || this->full_pw)
                 {
-                    if (length == 0) st_bottom2D[index] = iz; // length == 0 means this point is the bottom of stick (x, y).
+                    if (length == 0) { st_bottom2D[index] = iz; // length == 0 means this point is the bottom of stick (x, y).
+}
                     ++this->npwtot;
                     ++length;
-                    if(iy < this->riy) this->riy = iy;
-                    if(iy > this->liy) this->liy = iy;
-                    if(ix < this->rix) this->rix = ix;
-                    if(ix > this->lix) this->lix = ix;
+                    if(iy < this->riy) { this->riy = iy;
+}
+                    if(iy > this->liy) { this->liy = iy;
+}
+                    if(ix < this->rix) { this->rix = ix;
+}
+                    if(ix > this->lix) { this->lix = ix;
+}
                 }
             }
             if (length > 0)
@@ -157,7 +164,7 @@ void PW_Basis::get_ig2isz_is2fftixy(
         delete[] this->is2fftixy; this->is2fftixy = nullptr; // map is (index of sticks) to ixy (iy + ix * fftny).
 #if defined(__CUDA) || defined(__ROCM)
         if (this->device == "gpu") {
-            delmem_int_op()(gpu_ctx, this->d_is2fftixy);
+            delmem_int_op()(this->d_is2fftixy);
             d_is2fftixy = nullptr;
         }
 #endif
@@ -182,20 +189,23 @@ void PW_Basis::get_ig2isz_is2fftixy(
             for (int iz = zstart; iz < zstart + st_length2D[ixy]; ++iz)
             {
                 int z = iz;
-                if (z < 0) z += this->nz;
+                if (z < 0) { z += this->nz;
+}
                 this->ig2isz[pw_filled] = st_move * this->nz + z;
                 pw_filled++;
             }
             this->is2fftixy[st_move] = ixy;
             st_move++;
-            if(xprime && ixy/fftny == 0) ng_xeq0 = pw_filled;
+            if(xprime && ixy/fftny == 0) { ng_xeq0 = pw_filled;
+}
         }
-        if (st_move == this->nst && pw_filled == this->npw) break;
+        if (st_move == this->nst && pw_filled == this->npw) { break;
+}
     }
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == "gpu") {
-        resmem_int_op()(gpu_ctx, d_is2fftixy, this->nst);
-        syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, this->d_is2fftixy, this->is2fftixy, this->nst);
+        resmem_int_op()(d_is2fftixy, this->nst);
+        syncmem_int_h2d_op()(this->d_is2fftixy, this->is2fftixy, this->nst);
     }
 #endif
     return;
diff --git a/source/module_basis/module_pw/pw_transform_k.cpp b/source/module_basis/module_pw/pw_transform_k.cpp
index 5e3780eef4..e230066c8f 100644
--- a/source/module_basis/module_pw/pw_transform_k.cpp
+++ b/source/module_basis/module_pw/pw_transform_k.cpp
@@ -345,8 +345,6 @@ void PW_Basis_K::real_to_recip(const base_device::DEVICE_GPU* ctx,
     assert(this->poolnproc == 1);
 
     base_device::memory::synchronize_memory_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(
-        ctx,
-        ctx,
         this->fft_bundle.get_auxr_3d_data<float>(),
         in,
         this->nrxx);
@@ -379,9 +377,7 @@ void PW_Basis_K::real_to_recip(const base_device::DEVICE_GPU* ctx,
 
     base_device::memory::synchronize_memory_op<std::complex<double>,
                                                base_device::DEVICE_GPU,
-                                               base_device::DEVICE_GPU>()(ctx,
-                                                                          ctx,
-                                                                          this->fft_bundle.get_auxr_3d_data<double>(),
+                                               base_device::DEVICE_GPU>()(this->fft_bundle.get_auxr_3d_data<double>(),
                                                                           in,
                                                                           this->nrxx);
 
@@ -413,7 +409,6 @@ void PW_Basis_K::recip_to_real(const base_device::DEVICE_GPU* ctx,
     assert(this->poolnproc == 1);
     // ModuleBase::GlobalFunc::ZEROS(fft_bundle.get_auxr_3d_data<float>(), this->nxyz);
     base_device::memory::set_memory_op<std::complex<float>, base_device::DEVICE_GPU>()(
-        ctx,
         this->fft_bundle.get_auxr_3d_data<float>(),
         0,
         this->nxyz);
@@ -450,7 +445,6 @@ void PW_Basis_K::recip_to_real(const base_device::DEVICE_GPU* ctx,
     assert(this->poolnproc == 1);
     // ModuleBase::GlobalFunc::ZEROS(fft_bundle.get_auxr_3d_data<double>(), this->nxyz);
     base_device::memory::set_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(
-        ctx,
         this->fft_bundle.get_auxr_3d_data<double>(),
         0,
         this->nxyz);
diff --git a/source/module_elecstate/elecstate_pw.cpp b/source/module_elecstate/elecstate_pw.cpp
index f55f2ec447..f241c59db8 100644
--- a/source/module_elecstate/elecstate_pw.cpp
+++ b/source/module_elecstate/elecstate_pw.cpp
@@ -33,26 +33,26 @@ ElecStatePW<T, Device>::~ElecStatePW()
 {
     if (PARAM.inp.device == "gpu" || PARAM.inp.precision == "single")
     {
-        delmem_var_op()(this->ctx, this->rho_data);
+        delmem_var_op()(this->rho_data);
         delete[] this->rho;
 
         if (PARAM.globalv.double_grid || PARAM.globalv.use_uspp)
         {
-            delmem_complex_op()(this->ctx, this->rhog_data);
+            delmem_complex_op()(this->rhog_data);
             delete[] this->rhog;
         }
         if (get_xc_func_type() == 3 || PARAM.inp.out_elf[0] > 0)
         {
-            delmem_var_op()(this->ctx, this->kin_r_data);
+            delmem_var_op()(this->kin_r_data);
             delete[] this->kin_r;
         }
     }
     if (PARAM.globalv.use_uspp)
     {
-        delmem_var_op()(this->ctx, this->becsum);
+        delmem_var_op()(this->becsum);
     }
-    delmem_complex_op()(this->ctx, this->wfcr);
-    delmem_complex_op()(this->ctx, this->wfcr_another_spin);
+    delmem_complex_op()(this->wfcr);
+    delmem_complex_op()(this->wfcr_another_spin);
 }
 
 template<typename T, typename Device>
@@ -66,7 +66,7 @@ void ElecStatePW<T, Device>::init_rho_data()
     if (PARAM.inp.device == "gpu" || PARAM.inp.precision == "single")
     {
         this->rho = new Real*[this->charge->nspin];
-        resmem_var_op()(this->ctx, this->rho_data, this->charge->nspin * this->charge->nrxx);
+        resmem_var_op()(this->rho_data, this->charge->nspin * this->charge->nrxx);
         for (int ii = 0; ii < this->charge->nspin; ii++)
         {
             this->rho[ii] = this->rho_data + ii * this->charge->nrxx;
@@ -74,7 +74,7 @@ void ElecStatePW<T, Device>::init_rho_data()
         if (PARAM.globalv.double_grid || PARAM.globalv.use_uspp)
         {
             this->rhog = new T*[this->charge->nspin];
-            resmem_complex_op()(this->ctx, this->rhog_data, this->charge->nspin * this->charge->rhopw->npw);
+            resmem_complex_op()(this->rhog_data, this->charge->nspin * this->charge->rhopw->npw);
             for (int ii = 0; ii < this->charge->nspin; ii++)
             {
                 this->rhog[ii] = this->rhog_data + ii * this->charge->rhopw->npw;
@@ -83,7 +83,7 @@ void ElecStatePW<T, Device>::init_rho_data()
         if (get_xc_func_type() == 3 || PARAM.inp.out_elf[0] > 0)
         {
             this->kin_r = new Real*[this->charge->nspin];
-            resmem_var_op()(this->ctx, this->kin_r_data, this->charge->nspin * this->charge->nrxx);
+            resmem_var_op()(this->kin_r_data, this->charge->nspin * this->charge->nrxx);
             for (int ii = 0; ii < this->charge->nspin; ii++) {
                 this->kin_r[ii] = this->kin_r_data + ii * this->charge->nrxx;
             }
@@ -101,8 +101,8 @@ void ElecStatePW<T, Device>::init_rho_data()
             this->kin_r = reinterpret_cast<Real **>(this->charge->kin_r);
         }
     }
-    resmem_complex_op()(this->ctx, this->wfcr, this->basis->nmaxgr, "ElecSPW::wfcr");
-    resmem_complex_op()(this->ctx, this->wfcr_another_spin, this->basis->nrxx, "ElecSPW::wfcr_a");
+    resmem_complex_op()(this->wfcr, this->basis->nmaxgr, "ElecSPW::wfcr");
+    resmem_complex_op()(this->wfcr_another_spin, this->basis->nrxx, "ElecSPW::wfcr_a");
     this->init_rho = true;
 }
 
@@ -118,15 +118,15 @@ void ElecStatePW<T, Device>::psiToRho(const psi::Psi<T, Device>& psi)
 	{
         // denghui replaced at 20221110
 		// ModuleBase::GlobalFunc::ZEROS(this->rho[is], this->charge->nrxx);
-        setmem_var_op()(this->ctx, this->rho[is], 0,  this->charge->nrxx);
+        setmem_var_op()(this->rho[is], 0,  this->charge->nrxx);
         if (get_xc_func_type() == 3)
         {
             // ModuleBase::GlobalFunc::ZEROS(this->charge->kin_r[is], this->charge->nrxx);
-            setmem_var_op()(this->ctx, this->kin_r[is], 0,  this->charge->nrxx);
+            setmem_var_op()(this->kin_r[is], 0,  this->charge->nrxx);
         }
         if (PARAM.globalv.double_grid || PARAM.globalv.use_uspp)
         {
-            setmem_complex_op()(this->ctx, this->rhog[is], 0, this->charge->rhopw->npw);
+            setmem_complex_op()(this->rhog[is], 0, this->charge->rhopw->npw);
         }
     }
 
@@ -142,10 +142,10 @@ void ElecStatePW<T, Device>::psiToRho(const psi::Psi<T, Device>& psi)
     {
         for (int ii = 0; ii < PARAM.inp.nspin; ii++)
         {
-            castmem_var_d2h_op()(cpu_ctx, this->ctx, this->charge->rho[ii], this->rho[ii], this->charge->nrxx);
+            castmem_var_d2h_op()(this->charge->rho[ii], this->rho[ii], this->charge->nrxx);
             if (get_xc_func_type() == 3)
             {
-                castmem_var_d2h_op()(cpu_ctx, this->ctx, this->charge->kin_r[ii], this->kin_r[ii], this->charge->nrxx);
+                castmem_var_d2h_op()(this->charge->kin_r[ii], this->kin_r[ii], this->charge->nrxx);
             }
         }
     }
@@ -244,7 +244,7 @@ void ElecStatePW<T, Device>::rhoBandK(const psi::Psi<T, Device>& psi)
             {
                 for (int j = 0; j < 3; j++)
                 {
-                    setmem_complex_op()(this->ctx, this->wfcr, 0,  this->charge->nrxx);
+                    setmem_complex_op()(this->wfcr, 0,  this->charge->nrxx);
 
                     meta_op()(this->ctx,
                               ik,
@@ -277,10 +277,10 @@ void ElecStatePW<T, Device>::cal_becsum(const psi::Psi<T, Device>& psi)
     const int nkb = this->ppcell->nkb;
     this->vkb = this->ppcell->template get_vkb_data<Real>();
     T* becp = nullptr;
-    resmem_complex_op()(this->ctx, becp, nbands * nkb, "ElecState<PW>::becp");
+    resmem_complex_op()(becp, nbands * nkb, "ElecState<PW>::becp");
     const int nh_tot = this->ppcell->nhm * (this->ppcell->nhm + 1) / 2;
-    resmem_var_op()(this->ctx, becsum, nh_tot * ucell->nat * PARAM.inp.nspin, "ElecState<PW>::becsum");
-    setmem_var_op()(this->ctx, becsum, 0, nh_tot * ucell->nat * PARAM.inp.nspin);
+    resmem_var_op()(becsum, nh_tot * ucell->nat * PARAM.inp.nspin, "ElecState<PW>::becsum");
+    setmem_var_op()(becsum, 0, nh_tot * ucell->nat * PARAM.inp.nspin);
 
     for (int ik = 0; ik < psi.get_nk(); ++ik)
     {
@@ -340,10 +340,9 @@ void ElecStatePW<T, Device>::cal_becsum(const psi::Psi<T, Device>& psi)
             if (atom->ncpp.tvanp)
             {
                 T *auxk1 = nullptr, *auxk2 = nullptr, *aux_gk = nullptr;
-                resmem_complex_op()(this->ctx, auxk1, nbands * atom->ncpp.nh, "ElecState<PW>::auxk1");
-                resmem_complex_op()(this->ctx, auxk2, nbands * atom->ncpp.nh, "ElecState<PW>::auxk2");
-                resmem_complex_op()(this->ctx,
-                                    aux_gk,
+                resmem_complex_op()(auxk1, nbands * atom->ncpp.nh, "ElecState<PW>::auxk1");
+                resmem_complex_op()(auxk2, nbands * atom->ncpp.nh, "ElecState<PW>::auxk2");
+                resmem_complex_op()(aux_gk,
                                     atom->ncpp.nh * atom->ncpp.nh * npol * npol,
                                     "ElecState<PW>::aux_gk");
                 for (int ia = 0; ia < atom->na; ia++)
@@ -414,13 +413,13 @@ void ElecStatePW<T, Device>::cal_becsum(const psi::Psi<T, Device>& psi)
                         }
                     }
                 }
-                delmem_complex_op()(this->ctx, auxk1);
-                delmem_complex_op()(this->ctx, auxk2);
-                delmem_complex_op()(this->ctx, aux_gk);
+                delmem_complex_op()(auxk1);
+                delmem_complex_op()(auxk2);
+                delmem_complex_op()(aux_gk);
             }
         }
     }
-    delmem_complex_op()(this->ctx, becp);
+    delmem_complex_op()(becp);
 }
 
 template <typename T, typename Device>
@@ -469,11 +468,11 @@ void ElecStatePW<T, Device>::addusdens_g(const Real* becsum, T** rhog)
     const std::complex<double> ci_tpi = ModuleBase::NEG_IMAG_UNIT * ModuleBase::TWO_PI;
 
     Real* qmod = nullptr;
-    resmem_var_op()(this->ctx, qmod, npw, "ElecState<PW>::qmod");
+    resmem_var_op()(qmod, npw, "ElecState<PW>::qmod");
     T* qgm = nullptr;
-    resmem_complex_op()(this->ctx, qgm, npw, "ElecState<PW>::qgm");
+    resmem_complex_op()(qgm, npw, "ElecState<PW>::qgm");
     Real* ylmk0 = nullptr;
-    resmem_var_op()(this->ctx, ylmk0, npw * lmaxq * lmaxq, "ElecState<PW>::ylmk0");
+    resmem_var_op()(ylmk0, npw * lmaxq * lmaxq, "ElecState<PW>::ylmk0");
     Real* g = reinterpret_cast<Real*>(this->charge->rhopw->gcar);
 
     ModuleBase::YlmReal::Ylm_Real(this->ctx, lmaxq * lmaxq, npw, g, ylmk0);
@@ -492,9 +491,9 @@ void ElecStatePW<T, Device>::addusdens_g(const Real* becsum, T** rhog)
             const int nij = atom->ncpp.nh * (atom->ncpp.nh + 1) / 2;
 
             T *skk = nullptr, *aux2 = nullptr, *tbecsum = nullptr;
-            resmem_complex_op()(this->ctx, skk, atom->na * npw, "ElecState<PW>::skk");
-            resmem_complex_op()(this->ctx, aux2, nij * npw, "ElecState<PW>::aux2");
-            resmem_complex_op()(this->ctx, tbecsum, PARAM.inp.nspin * atom->na * nij, "ElecState<PW>::tbecsum");
+            resmem_complex_op()(skk, atom->na * npw, "ElecState<PW>::skk");
+            resmem_complex_op()(aux2, nij * npw, "ElecState<PW>::aux2");
+            resmem_complex_op()(tbecsum, PARAM.inp.nspin * atom->na * nij, "ElecState<PW>::tbecsum");
             for (int ia = 0; ia < atom->na; ia++)
             {
                 const int iat = ucell->itia2iat(it, ia);
@@ -548,15 +547,15 @@ void ElecStatePW<T, Device>::addusdens_g(const Real* becsum, T** rhog)
                     }
                 }
             }
-            delmem_complex_op()(this->ctx, skk);
-            delmem_complex_op()(this->ctx, aux2);
-            delmem_complex_op()(this->ctx, tbecsum);
+            delmem_complex_op()(skk);
+            delmem_complex_op()(aux2);
+            delmem_complex_op()(tbecsum);
         }
     }
 
-    delmem_var_op()(this->ctx, qmod);
-    delmem_complex_op()(this->ctx, qgm);
-    delmem_var_op()(this->ctx, ylmk0);
+    delmem_var_op()(qmod);
+    delmem_complex_op()(qgm);
+    delmem_var_op()(ylmk0);
 }
 
 template class ElecStatePW<std::complex<float>, base_device::DEVICE_CPU>;
diff --git a/source/module_elecstate/elecstate_pw_cal_tau.cpp b/source/module_elecstate/elecstate_pw_cal_tau.cpp
index ad8c9ce42f..5c225c3d62 100644
--- a/source/module_elecstate/elecstate_pw_cal_tau.cpp
+++ b/source/module_elecstate/elecstate_pw_cal_tau.cpp
@@ -9,7 +9,7 @@ void ElecStatePW<T, Device>::cal_tau(const psi::Psi<T, Device>& psi)
     ModuleBase::TITLE("ElecStatePW", "cal_tau");
     for(int is=0; is<PARAM.inp.nspin; is++)
 	{
-        setmem_var_op()(this->ctx, this->kin_r[is], 0,  this->charge->nrxx);
+        setmem_var_op()(this->kin_r[is], 0,  this->charge->nrxx);
 	}
 
     for (int ik = 0; ik < psi.get_nk(); ++ik)
@@ -31,7 +31,7 @@ void ElecStatePW<T, Device>::cal_tau(const psi::Psi<T, Device>& psi)
             // kinetic energy density
             for (int j = 0; j < 3; j++)
             {
-                setmem_complex_op()(this->ctx, this->wfcr, 0,  this->charge->nrxx);
+                setmem_complex_op()(this->wfcr, 0,  this->charge->nrxx);
 
                 meta_op()(this->ctx,
                             ik,
@@ -52,7 +52,7 @@ void ElecStatePW<T, Device>::cal_tau(const psi::Psi<T, Device>& psi)
     }
     if (PARAM.inp.device == "gpu" || PARAM.inp.precision == "single") {
         for (int ii = 0; ii < PARAM.inp.nspin; ii++) {
-            castmem_var_d2h_op()(cpu_ctx, this->ctx, this->charge->kin_r[ii], this->kin_r[ii], this->charge->nrxx);
+            castmem_var_d2h_op()(this->charge->kin_r[ii], this->kin_r[ii], this->charge->nrxx);
         }
     }
     this->parallelK();
diff --git a/source/module_elecstate/elecstate_pw_sdft.cpp b/source/module_elecstate/elecstate_pw_sdft.cpp
index ad6f98c3c3..bef6277adb 100644
--- a/source/module_elecstate/elecstate_pw_sdft.cpp
+++ b/source/module_elecstate/elecstate_pw_sdft.cpp
@@ -16,7 +16,7 @@ void ElecStatePW_SDFT<T, Device>::psiToRho(const psi::Psi<T, Device>& psi)
     const int nspin = PARAM.inp.nspin;
     for (int is = 0; is < nspin; is++)
     {
-        setmem_var_op()(this->ctx, this->rho[is], 0, this->charge->nrxx);
+        setmem_var_op()(this->rho[is], 0, this->charge->nrxx);
     }
 
     if (GlobalV::MY_STOGROUP == 0)
@@ -28,7 +28,7 @@ void ElecStatePW_SDFT<T, Device>::psiToRho(const psi::Psi<T, Device>& psi)
         }
         if (PARAM.inp.device == "gpu" || PARAM.inp.precision == "single") {
         for (int ii = 0; ii < nspin; ii++) {
-            castmem_var_d2h_op()(cpu_ctx, this->ctx, this->charge->rho[ii], this->rho[ii], this->charge->nrxx);
+            castmem_var_d2h_op()(this->charge->rho[ii], this->rho[ii], this->charge->nrxx);
         }
         }
         this->parallelK();
diff --git a/source/module_elecstate/kernels/test/elecstate_op_test.cpp b/source/module_elecstate/kernels/test/elecstate_op_test.cpp
index 79635c7895..973df83cea 100644
--- a/source/module_elecstate/kernels/test/elecstate_op_test.cpp
+++ b/source/module_elecstate/kernels/test/elecstate_op_test.cpp
@@ -107,10 +107,10 @@ TEST_F(TestModuleElecstateMultiDevice, elecstate_pw_op_gpu)
     std::vector<double> rho_data(expected_rho.size(), 0);
     double* d_rho_data = NULL;
     std::complex<double>* d_wfcr = NULL;
-    resize_memory_var_op()(gpu_ctx, d_rho_data, rho_data.size());
-    resize_memory_complex_op()(gpu_ctx, d_wfcr, wfcr.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_rho_data, rho_data.data(), rho_data.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_wfcr, wfcr.data(), wfcr.size());
+    resize_memory_var_op()(d_rho_data, rho_data.size());
+    resize_memory_complex_op()(d_wfcr, wfcr.size());
+    syncmem_var_h2d_op()(d_rho_data, rho_data.data(), rho_data.size());
+    syncmem_complex_h2d_op()(d_wfcr, wfcr.data(), wfcr.size());
     double ** rho = new double* [1];
     rho[0] = d_rho_data;
     elecstate_gpu_op()(
@@ -120,7 +120,7 @@ TEST_F(TestModuleElecstateMultiDevice, elecstate_pw_op_gpu)
       rho, 
       d_wfcr);
     
-    syncmem_var_d2h_op()(cpu_ctx, gpu_ctx, rho_data.data(), d_rho_data, rho_data.size());
+    syncmem_var_d2h_op()(rho_data.data(), d_rho_data, rho_data.size());
     // check the result 
     for (int ii = 0; ii < rho_data.size(); ii++) {
         EXPECT_LT(fabs(rho_data[ii] - expected_rho[ii]), 6e-5);
@@ -136,12 +136,12 @@ TEST_F(TestModuleElecstateMultiDevice, elecstate_pw_spin_op_gpu)
     double* d_rho_data_2 = NULL;
     std::complex<double>* d_wfcr_2 = NULL;
     std::complex<double>* d_wfcr_another_spin_2 = NULL;
-    resize_memory_var_op()(gpu_ctx, d_rho_data_2, rho_data_2.size());
-    resize_memory_complex_op()(gpu_ctx, d_wfcr_2, wfcr_2.size());
-    resize_memory_complex_op()(gpu_ctx, d_wfcr_another_spin_2, wfcr_another_spin_2.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_rho_data_2, rho_data_2.data(), rho_data_2.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_wfcr_2, wfcr_2.data(), wfcr_2.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_wfcr_another_spin_2, wfcr_another_spin_2.data(), wfcr_another_spin_2.size());
+    resize_memory_var_op()(d_rho_data_2, rho_data_2.size());
+    resize_memory_complex_op()(d_wfcr_2, wfcr_2.size());
+    resize_memory_complex_op()(d_wfcr_another_spin_2, wfcr_another_spin_2.size());
+    syncmem_var_h2d_op()(d_rho_data_2, rho_data_2.data(), rho_data_2.size());
+    syncmem_complex_h2d_op()(d_wfcr_2, wfcr_2.data(), wfcr_2.size());
+    syncmem_complex_h2d_op()(d_wfcr_another_spin_2, wfcr_another_spin_2.data(), wfcr_another_spin_2.size());
     double ** rho = new double* [4];
     rho[0] = d_rho_data_2;
     rho[1] = d_rho_data_2 + this->nrxx;
@@ -158,7 +158,7 @@ TEST_F(TestModuleElecstateMultiDevice, elecstate_pw_spin_op_gpu)
       d_wfcr_2,
       d_wfcr_another_spin_2);
     
-    syncmem_var_d2h_op()(cpu_ctx, gpu_ctx, rho_data_2.data(), d_rho_data_2, rho_data_2.size());
+    syncmem_var_d2h_op()(rho_data_2.data(), d_rho_data_2, rho_data_2.size());
     // check the result 
     for (int ii = 0; ii < rho_data_2.size(); ii++) {
         EXPECT_LT(fabs(rho_data_2[ii] - expected_rho_2[ii]), 5e-4);
diff --git a/source/module_elecstate/potentials/potential_new.cpp b/source/module_elecstate/potentials/potential_new.cpp
index a4443c46d8..f3d68df05a 100644
--- a/source/module_elecstate/potentials/potential_new.cpp
+++ b/source/module_elecstate/potentials/potential_new.cpp
@@ -50,18 +50,18 @@ Potential::~Potential()
     }
     if (PARAM.inp.basis_type == "pw" && PARAM.inp.device == "gpu") {
         if (PARAM.inp.precision == "single") {
-            delmem_sd_op()(gpu_ctx, s_veff_smooth);
-            delmem_sd_op()(gpu_ctx, s_vofk_smooth);
+            delmem_sd_op()(s_veff_smooth);
+            delmem_sd_op()(s_vofk_smooth);
         }
         else {
-            delmem_dd_op()(gpu_ctx, d_veff_smooth);
-            delmem_dd_op()(gpu_ctx, d_vofk_smooth);
+            delmem_dd_op()(d_veff_smooth);
+            delmem_dd_op()(d_vofk_smooth);
         }
     }
     else {
         if (PARAM.inp.precision == "single") {
-            delmem_sh_op()(cpu_ctx, s_veff_smooth);
-            delmem_sh_op()(cpu_ctx, s_vofk_smooth);
+            delmem_sh_op()(s_veff_smooth);
+            delmem_sh_op()(s_vofk_smooth);
         }
     }
 }
@@ -133,18 +133,18 @@ void Potential::allocate()
     }
     if (PARAM.inp.basis_type == "pw" && PARAM.inp.device == "gpu") {
         if (PARAM.inp.precision == "single") {
-            resmem_sd_op()(gpu_ctx, s_veff_smooth, PARAM.inp.nspin * nrxx_smooth);
-            resmem_sd_op()(gpu_ctx, s_vofk_smooth, PARAM.inp.nspin * nrxx_smooth);
+            resmem_sd_op()(s_veff_smooth, PARAM.inp.nspin * nrxx_smooth);
+            resmem_sd_op()(s_vofk_smooth, PARAM.inp.nspin * nrxx_smooth);
         }
         else {
-            resmem_dd_op()(gpu_ctx, d_veff_smooth, PARAM.inp.nspin * nrxx_smooth);
-            resmem_dd_op()(gpu_ctx, d_vofk_smooth, PARAM.inp.nspin * nrxx_smooth);
+            resmem_dd_op()(d_veff_smooth, PARAM.inp.nspin * nrxx_smooth);
+            resmem_dd_op()(d_vofk_smooth, PARAM.inp.nspin * nrxx_smooth);
         }
     }
     else {
         if (PARAM.inp.precision == "single") {
-            resmem_sh_op()(cpu_ctx, s_veff_smooth, PARAM.inp.nspin * nrxx_smooth, "POT::sveff_smooth");
-            resmem_sh_op()(cpu_ctx, s_vofk_smooth, PARAM.inp.nspin * nrxx_smooth, "POT::svofk_smooth");
+            resmem_sh_op()(s_veff_smooth, PARAM.inp.nspin * nrxx_smooth, "POT::sveff_smooth");
+            resmem_sh_op()(s_vofk_smooth, PARAM.inp.nspin * nrxx_smooth, "POT::svofk_smooth");
         }
         else {
             this->d_veff_smooth = this->veff_smooth.c;
@@ -181,40 +181,28 @@ void Potential::update_from_charge(const Charge*const chg, const UnitCell*const
 
     if (PARAM.inp.basis_type == "pw" && PARAM.inp.device == "gpu") {
         if (PARAM.inp.precision == "single") {
-            castmem_d2s_h2d_op()(gpu_ctx,
-                                 cpu_ctx,
-                                 s_veff_smooth,
+            castmem_d2s_h2d_op()(s_veff_smooth,
                                  this->veff_smooth.c,
                                  this->veff_smooth.nr * this->veff_smooth.nc);
-            castmem_d2s_h2d_op()(gpu_ctx,
-                                 cpu_ctx,
-                                 s_vofk_smooth,
+            castmem_d2s_h2d_op()(s_vofk_smooth,
                                  this->vofk_smooth.c,
                                  this->vofk_smooth.nr * this->vofk_smooth.nc);
         }
         else {
-            syncmem_d2d_h2d_op()(gpu_ctx,
-                                 cpu_ctx,
-                                 d_veff_smooth,
+            syncmem_d2d_h2d_op()(d_veff_smooth,
                                  this->veff_smooth.c,
                                  this->veff_smooth.nr * this->veff_smooth.nc);
-            syncmem_d2d_h2d_op()(gpu_ctx,
-                                 cpu_ctx,
-                                 d_vofk_smooth,
+            syncmem_d2d_h2d_op()(d_vofk_smooth,
                                  this->vofk_smooth.c,
                                  this->vofk_smooth.nr * this->vofk_smooth.nc);
         }
     }
     else {
         if (PARAM.inp.precision == "single") {
-            castmem_d2s_h2h_op()(cpu_ctx,
-                                 cpu_ctx,
-                                 s_veff_smooth,
+            castmem_d2s_h2h_op()(s_veff_smooth,
                                  this->veff_smooth.c,
                                  this->veff_smooth.nr * this->veff_smooth.nc);
-            castmem_d2s_h2h_op()(cpu_ctx,
-                                 cpu_ctx,
-                                 s_vofk_smooth,
+            castmem_d2s_h2h_op()(s_vofk_smooth,
                                  this->vofk_smooth.c,
                                  this->vofk_smooth.nr * this->vofk_smooth.nc);
         }
diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp
index 84bf0fe8a4..a96d487a5c 100644
--- a/source/module_esolver/esolver_ks_pw.cpp
+++ b/source/module_esolver/esolver_ks_pw.cpp
@@ -646,9 +646,7 @@ void ESolver_KS_PW<T, Device>::after_scf(UnitCell& ucell, const int istep)
     // 4) Transfer data from GPU to CPU
     if (this->device == base_device::GpuDevice)
     {
-        castmem_2d_d2h_op()(this->psi[0].get_device(),
-                            this->kspw_psi[0].get_device(),
-                            this->psi[0].get_pointer() - this->psi[0].get_psi_bias(),
+        castmem_2d_d2h_op()(this->psi[0].get_pointer() - this->psi[0].get_psi_bias(),
                             this->kspw_psi[0].get_pointer() - this->kspw_psi[0].get_psi_bias(),
                             this->psi[0].size());
     }
diff --git a/source/module_hamilt_general/hamilt.h b/source/module_hamilt_general/hamilt.h
index 70dcd1b20a..cb204cc298 100644
--- a/source/module_hamilt_general/hamilt.h
+++ b/source/module_hamilt_general/hamilt.h
@@ -39,7 +39,7 @@ class Hamilt
                       const int nbands // number of bands
     ) const
     {
-        syncmem_op()(this->ctx, this->ctx, spsi, psi_in, static_cast<size_t>(nbands * nrow));
+        syncmem_op()(spsi, psi_in, static_cast<size_t>(nbands * nrow));
     }
 
 	/// core function: return H(k) and S(k) matrixs for direct solving eigenvalues.
diff --git a/source/module_hamilt_general/operator.cpp b/source/module_hamilt_general/operator.cpp
index 008d5e30e3..e9020866e6 100644
--- a/source/module_hamilt_general/operator.cpp
+++ b/source/module_hamilt_general/operator.cpp
@@ -59,7 +59,7 @@ typename Operator<T, Device>::hpsi_info Operator<T, Device>::hPsi(hpsi_info& inp
     if (this->in_place)
     {
         // ModuleBase::GlobalFunc::COPYARRAY(this->hpsi->get_pointer(), hpsi_pointer, this->hpsi->size());
-        syncmem_op()(this->ctx, this->ctx, hpsi_pointer, this->hpsi->get_pointer(), this->hpsi->size());
+        syncmem_op()(hpsi_pointer, this->hpsi->get_pointer(), this->hpsi->size());
         delete this->hpsi;
         this->hpsi = new psi::Psi<T, Device>(hpsi_pointer, 
                                              1, 
diff --git a/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp b/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp
index 87a2fa41cc..36baed7bab 100644
--- a/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp
@@ -27,8 +27,8 @@ void spinconstrain::SpinConstrain<std::complex<double>>::calculate_delta_hcc(std
 #if ((defined __CUDA) || (defined __ROCM))
         base_device::DEVICE_GPU* ctx = {};
         base_device::DEVICE_CPU* cpu_ctx = {};
-        base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx, becp_cpu, size_ps);
-        base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(cpu_ctx, ctx, becp_cpu, becp_k, size_ps);   
+        base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_CPU>()(becp_cpu, size_ps);
+        base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(becp_cpu, becp_k, size_ps);   
 #endif
     }
     else if (PARAM.inp.device == "cpu")
@@ -68,8 +68,8 @@ void spinconstrain::SpinConstrain<std::complex<double>>::calculate_delta_hcc(std
 #if ((defined __CUDA) || (defined __ROCM))
         base_device::DEVICE_GPU* ctx = {};
         base_device::DEVICE_CPU* cpu_ctx = {};
-        base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, ps_pointer, size_ps);
-        base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(ctx, cpu_ctx, ps_pointer, ps.data(), size_ps);   
+        base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ps_pointer, size_ps);
+        base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(ps_pointer, ps.data(), size_ps);   
 #endif
     }
     else if (PARAM.inp.device == "cpu")
@@ -100,7 +100,7 @@ void spinconstrain::SpinConstrain<std::complex<double>>::calculate_delta_hcc(std
             h_tmp,
             nbands
         );
-        base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, ps_pointer);
+        base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ps_pointer);
         delete[] becp_cpu;
 #endif
 
@@ -260,20 +260,20 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mw_from_lambda(int
                 becp_tmp.resize(size_becp * nk);
                 std::complex<double>* h_tmp = nullptr;
                 std::complex<double>* s_tmp = nullptr;
-                base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, h_tmp, nbands * nbands);
-                base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, s_tmp, nbands * nbands);
+                base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(h_tmp, nbands * nbands);
+                base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(s_tmp, nbands * nbands);
                 int initial_hs = 0;
                 if(this->sub_h_save == nullptr)
                 {
                     initial_hs = 1;
                     
-                    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, this->sub_h_save, nbands * nbands * nk);
-                    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, this->sub_s_save, nbands * nbands * nk);
-                    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, this->becp_save, size_becp * nk);
+                    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(this->sub_h_save, nbands * nbands * nk);
+                    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(this->sub_s_save, nbands * nbands * nk);
+                    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(this->becp_save, size_becp * nk);
                 }
                 std::complex<double>* becp_pointer = nullptr;
                 // allocate memory for becp_pointer in GPU device
-                base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, becp_pointer, size_becp);
+                base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(becp_pointer, size_becp);
                 for (int ik = 0; ik < nk; ++ik)
                 {
                     psi_t->fix_k(ik);
@@ -286,10 +286,10 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mw_from_lambda(int
                         /// update H(k) for each k point
                         hamilt_t->updateHk(ik);
                         hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::cal_hs_subspace(hamilt_t, psi_t[0], h_k, s_k);
-                        base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, becp_k, onsite_p->get_becp(), size_becp);
+                        base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(becp_k, onsite_p->get_becp(), size_becp);
                     }
-                    base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, h_tmp, h_k, nbands * nbands);
-                    base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, s_tmp, s_k, nbands * nbands);
+                    base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(h_tmp, h_k, nbands * nbands);
+                    base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(s_tmp, s_k, nbands * nbands);
                     // update h_tmp by delta_lambda
                     if (i_step != -1) this->calculate_delta_hcc(h_tmp, becp_k, delta_lambda, nbands, nkb, nh_iat);
 
@@ -301,11 +301,11 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mw_from_lambda(int
                                                                                   nkb * npol,
                                                                                   &this->pelec->ekb(ik, 0));
                     // copy becp_pointer from GPU to CPU
-                    base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(cpu_ctx, ctx, &becp_tmp[ik * size_becp], becp_pointer, size_becp);   
+                    base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(&becp_tmp[ik * size_becp], becp_pointer, size_becp);   
                 }
 
                 // free memory for becp_pointer in GPU device
-                base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, becp_pointer);
+                base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(becp_pointer);
             }
 #endif
             // calculate weights from ekb to update wg
@@ -462,8 +462,8 @@ void spinconstrain::SpinConstrain<std::complex<double>>::update_psi_charge(const
 
             std::complex<double>* h_tmp = nullptr;
             std::complex<double>* s_tmp = nullptr;
-            base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, h_tmp, nbands * nbands);
-            base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, s_tmp, nbands * nbands);
+            base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(h_tmp, nbands * nbands);
+            base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(s_tmp, nbands * nbands);
             assert(this->sub_h_save != nullptr);
             assert(this->sub_s_save != nullptr);
             assert(this->becp_save != nullptr);
@@ -474,8 +474,8 @@ void spinconstrain::SpinConstrain<std::complex<double>>::update_psi_charge(const
                 std::complex<double>* becp_k = this->becp_save + ik * size_becp;
 
                 psi_t->fix_k(ik);
-                base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, h_tmp, h_k, nbands * nbands);
-                base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, s_tmp, s_k, nbands * nbands);
+                base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(h_tmp, h_k, nbands * nbands);
+                base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(s_tmp, s_k, nbands * nbands);
                 this->calculate_delta_hcc(h_tmp, becp_k, delta_lambda, nbands, nkb, nh_iat);
                 hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::diag_subspace_psi(h_tmp,
                                                                                 s_tmp,
@@ -484,9 +484,9 @@ void spinconstrain::SpinConstrain<std::complex<double>>::update_psi_charge(const
                                                                                 &this->pelec->ekb(ik, 0));
             }
 
-            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, sub_h_save);
-            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, sub_s_save);
-            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, becp_save);
+            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(sub_h_save);
+            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(sub_s_save);
+            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(becp_save);
             this->sub_h_save = nullptr;
             this->sub_s_save = nullptr;
             this->becp_save = nullptr;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/VNL_in_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/VNL_in_pw.cpp
index b41c8f476e..bcd0cba74d 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/VNL_in_pw.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/VNL_in_pw.cpp
@@ -36,42 +36,42 @@ void pseudopot_cell_vnl::release_memory()
     {
         if (PARAM.inp.precision == "single")
         {
-            delmem_sd_op()(gpu_ctx, this->s_deeq);
-            delmem_sd_op()(gpu_ctx, this->s_nhtol);
-            delmem_sd_op()(gpu_ctx, this->s_nhtolm);
-            delmem_sd_op()(gpu_ctx, this->s_indv);
-            delmem_sd_op()(gpu_ctx, this->s_tab);
-            delmem_sd_op()(gpu_ctx, this->s_qq_nt);
-            delmem_cd_op()(gpu_ctx, this->c_deeq_nc);
-            delmem_cd_op()(gpu_ctx, this->c_vkb);
-            delmem_cd_op()(gpu_ctx, this->c_qq_so);
+            delmem_sd_op()(this->s_deeq);
+            delmem_sd_op()(this->s_nhtol);
+            delmem_sd_op()(this->s_nhtolm);
+            delmem_sd_op()(this->s_indv);
+            delmem_sd_op()(this->s_tab);
+            delmem_sd_op()(this->s_qq_nt);
+            delmem_cd_op()(this->c_deeq_nc);
+            delmem_cd_op()(this->c_vkb);
+            delmem_cd_op()(this->c_qq_so);
         }
         else
         {
-            delmem_zd_op()(gpu_ctx, this->z_deeq_nc);
-            delmem_zd_op()(gpu_ctx, this->z_qq_so);
+            delmem_zd_op()(this->z_deeq_nc);
+            delmem_zd_op()(this->z_qq_so);
         }
-        delmem_dd_op()(gpu_ctx, this->d_deeq);
-        delmem_zd_op()(gpu_ctx, this->z_vkb);
-        delmem_dd_op()(gpu_ctx, this->d_tab);
-        delmem_dd_op()(gpu_ctx, this->d_indv);
-        delmem_dd_op()(gpu_ctx, this->d_nhtol);
-        delmem_dd_op()(gpu_ctx, this->d_nhtolm);
-        delmem_dd_op()(gpu_ctx, this->d_qq_nt);
+        delmem_dd_op()(this->d_deeq);
+        delmem_zd_op()(this->z_vkb);
+        delmem_dd_op()(this->d_tab);
+        delmem_dd_op()(this->d_indv);
+        delmem_dd_op()(this->d_nhtol);
+        delmem_dd_op()(this->d_nhtolm);
+        delmem_dd_op()(this->d_qq_nt);
     }
     else
     {
         if (PARAM.inp.precision == "single")
         {
-            delmem_sh_op()(cpu_ctx, this->s_deeq);
-            delmem_sh_op()(cpu_ctx, this->s_nhtol);
-            delmem_sh_op()(cpu_ctx, this->s_nhtolm);
-            delmem_sh_op()(cpu_ctx, this->s_indv);
-            delmem_sh_op()(cpu_ctx, this->s_tab);
-            delmem_sh_op()(cpu_ctx, this->s_qq_nt);
-            delmem_ch_op()(cpu_ctx, this->c_deeq_nc);
-            delmem_ch_op()(cpu_ctx, this->c_vkb);
-            delmem_ch_op()(cpu_ctx, this->c_qq_so);
+            delmem_sh_op()(this->s_deeq);
+            delmem_sh_op()(this->s_nhtol);
+            delmem_sh_op()(this->s_nhtolm);
+            delmem_sh_op()(this->s_indv);
+            delmem_sh_op()(this->s_tab);
+            delmem_sh_op()(this->s_qq_nt);
+            delmem_ch_op()(this->c_deeq_nc);
+            delmem_ch_op()(this->c_vkb);
+            delmem_ch_op()(this->c_qq_so);
         }
         // There's no need to delete double precision pointers while in a CPU environment.
     }
@@ -158,42 +158,40 @@ void pseudopot_cell_vnl::init(const UnitCell& ucell,
         {
             if (PARAM.inp.precision == "single")
             {
-                resmem_sd_op()(gpu_ctx, s_deeq, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm);
-                resmem_sd_op()(gpu_ctx, s_nhtol, ntype * this->nhm);
-                resmem_sd_op()(gpu_ctx, s_nhtolm, ntype * this->nhm);
-                resmem_sd_op()(gpu_ctx, s_indv, ntype * this->nhm);
-                resmem_sd_op()(gpu_ctx, s_qq_nt, ntype * this->nhm * this->nhm);
-                resmem_cd_op()(gpu_ctx, c_deeq_nc, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm);
-                resmem_cd_op()(gpu_ctx, c_qq_so, ntype * 4 * this->nhm * this->nhm);
+                resmem_sd_op()(s_deeq, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm);
+                resmem_sd_op()(s_nhtol, ntype * this->nhm);
+                resmem_sd_op()(s_nhtolm, ntype * this->nhm);
+                resmem_sd_op()(s_indv, ntype * this->nhm);
+                resmem_sd_op()(s_qq_nt, ntype * this->nhm * this->nhm);
+                resmem_cd_op()(c_deeq_nc, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm);
+                resmem_cd_op()(c_qq_so, ntype * 4 * this->nhm * this->nhm);
             }
             else
             {
-                resmem_zd_op()(gpu_ctx, z_deeq_nc, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm);
-                resmem_zd_op()(gpu_ctx, z_qq_so, ntype * 4 * this->nhm * this->nhm);
+                resmem_zd_op()(z_deeq_nc, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm);
+                resmem_zd_op()(z_qq_so, ntype * 4 * this->nhm * this->nhm);
             }
-            resmem_dd_op()(gpu_ctx, d_deeq, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm);
-            resmem_dd_op()(gpu_ctx, d_indv, ntype * this->nhm);
-            resmem_dd_op()(gpu_ctx, d_nhtol, ntype * this->nhm);
-            resmem_dd_op()(gpu_ctx, d_nhtolm, ntype * this->nhm);
-            resmem_dd_op()(gpu_ctx, d_qq_nt, ntype * this->nhm * this->nhm);
+            resmem_dd_op()(d_deeq, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm);
+            resmem_dd_op()(d_indv, ntype * this->nhm);
+            resmem_dd_op()(d_nhtol, ntype * this->nhm);
+            resmem_dd_op()(d_nhtolm, ntype * this->nhm);
+            resmem_dd_op()(d_qq_nt, ntype * this->nhm * this->nhm);
         }
         else
         {
             if (PARAM.inp.precision == "single")
             {
-                resmem_sh_op()(cpu_ctx,
-                               s_deeq,
+                resmem_sh_op()(s_deeq,
                                PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm,
                                "VNL::s_deeq");
-                resmem_sh_op()(cpu_ctx, s_nhtol, ntype * this->nhm, "VNL::s_nhtol");
-                resmem_sh_op()(cpu_ctx, s_nhtolm, ntype * this->nhm, "VNL::s_nhtolm");
-                resmem_sh_op()(cpu_ctx, s_indv, ntype * this->nhm, "VNL::s_indv");
-                resmem_sh_op()(cpu_ctx, s_qq_nt, ntype * this->nhm * this->nhm, "VNL::s_qq_nt");
-                resmem_ch_op()(cpu_ctx,
-                               c_deeq_nc,
+                resmem_sh_op()(s_nhtol, ntype * this->nhm, "VNL::s_nhtol");
+                resmem_sh_op()(s_nhtolm, ntype * this->nhm, "VNL::s_nhtolm");
+                resmem_sh_op()(s_indv, ntype * this->nhm, "VNL::s_indv");
+                resmem_sh_op()(s_qq_nt, ntype * this->nhm * this->nhm, "VNL::s_qq_nt");
+                resmem_ch_op()(c_deeq_nc,
                                PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm,
                                "VNL::c_deeq_nc");
-                resmem_ch_op()(cpu_ctx, c_qq_so, ntype * 4 * this->nhm * this->nhm, "VNL::c_qq_so");
+                resmem_ch_op()(c_qq_so, ntype * 4 * this->nhm * this->nhm, "VNL::c_qq_so");
             }
             else
             {
@@ -275,18 +273,18 @@ void pseudopot_cell_vnl::init(const UnitCell& ucell,
     {
         if (PARAM.inp.precision == "single")
         {
-            resmem_sd_op()(gpu_ctx, s_tab, this->tab.getSize());
-            resmem_cd_op()(gpu_ctx, c_vkb, nkb * npwx);
+            resmem_sd_op()(s_tab, this->tab.getSize());
+            resmem_cd_op()(c_vkb, nkb * npwx);
         }
-        resmem_zd_op()(gpu_ctx, z_vkb, nkb * npwx);
-        resmem_dd_op()(gpu_ctx, d_tab, this->tab.getSize());
+        resmem_zd_op()(z_vkb, nkb * npwx);
+        resmem_dd_op()(d_tab, this->tab.getSize());
     }
     else
     {
         if (PARAM.inp.precision == "single")
         {
-            resmem_sh_op()(cpu_ctx, s_tab, this->tab.getSize());
-            resmem_ch_op()(cpu_ctx, c_vkb, nkb * npwx);
+            resmem_sh_op()(s_tab, this->tab.getSize());
+            resmem_ch_op()(c_vkb, nkb * npwx);
         }
         this->z_vkb = this->vkb.c;
         this->d_tab = this->tab.ptr;
@@ -339,7 +337,7 @@ void pseudopot_cell_vnl::getvnl(const int& ik, const UnitCell& ucell, ModuleBase
     using resmem_complex_op = base_device::memory::resize_memory_op<std::complex<double>, Device>;
     using delmem_complex_op = base_device::memory::delete_memory_op<std::complex<double>, Device>;
     std::complex<double>* sk = nullptr;
-    resmem_complex_op()(ctx, sk, ucell.nat * npw, "VNL::sk");
+    resmem_complex_op()(sk, ucell.nat * npw, "VNL::sk");
     this->psf->get_sk(ctx, ik, this->wfcpw, sk);
 
     int jkb = 0, iat = 0;
@@ -404,7 +402,7 @@ void pseudopot_cell_vnl::getvnl(const int& ik, const UnitCell& ucell, ModuleBase
 
     delete[] gk;
     delete[] vq;
-    delmem_complex_op()(ctx, sk);
+    delmem_complex_op()(sk);
     ModuleBase::timer::tick("pp_cell_vnl", "getvnl");
 
     return;
@@ -457,8 +455,8 @@ void pseudopot_cell_vnl::getvnl(Device* ctx,
     FPTYPE *vkb1 = nullptr, *gk = nullptr, *ylm = nullptr, *_tab = this->get_tab_data<FPTYPE>(),
            *_indv = this->get_indv_data<FPTYPE>(), *_nhtol = this->get_nhtol_data<FPTYPE>(),
            *_nhtolm = this->get_nhtolm_data<FPTYPE>();
-    resmem_var_op()(ctx, ylm, x1 * npw, "VNL::ylm");
-    resmem_var_op()(ctx, vkb1, nhm * npw, "VNL::vkb1");
+    resmem_var_op()(ylm, x1 * npw, "VNL::ylm");
+    resmem_var_op()(vkb1, nhm * npw, "VNL::vkb1");
 
     ModuleBase::Vector3<double>* _gk = new ModuleBase::Vector3<double>[npw];
 #ifdef _OPENMP
@@ -470,15 +468,15 @@ void pseudopot_cell_vnl::getvnl(Device* ctx,
     }
     if (PARAM.inp.device == "gpu")
     {
-        resmem_int_op()(ctx, atom_nh, ucell.ntype);
-        resmem_int_op()(ctx, atom_nb, ucell.ntype);
-        resmem_int_op()(ctx, atom_na, ucell.ntype);
-        syncmem_int_op()(ctx, cpu_ctx, atom_nh, h_atom_nh, ucell.ntype);
-        syncmem_int_op()(ctx, cpu_ctx, atom_nb, h_atom_nb, ucell.ntype);
-        syncmem_int_op()(ctx, cpu_ctx, atom_na, h_atom_na, ucell.ntype);
-
-        resmem_var_op()(ctx, gk, npw * 3);
-        castmem_var_h2d_op()(ctx, cpu_ctx, gk, reinterpret_cast<double*>(_gk), npw * 3);
+        resmem_int_op()(atom_nh, ucell.ntype);
+        resmem_int_op()(atom_nb, ucell.ntype);
+        resmem_int_op()(atom_na, ucell.ntype);
+        syncmem_int_op()(atom_nh, h_atom_nh, ucell.ntype);
+        syncmem_int_op()(atom_nb, h_atom_nb, ucell.ntype);
+        syncmem_int_op()(atom_na, h_atom_na, ucell.ntype);
+
+        resmem_var_op()(gk, npw * 3);
+        castmem_var_h2d_op()(gk, reinterpret_cast<double*>(_gk), npw * 3);
     }
     else
     {
@@ -487,8 +485,8 @@ void pseudopot_cell_vnl::getvnl(Device* ctx,
         atom_na = h_atom_na;
         if (PARAM.inp.precision == "single")
         {
-            resmem_var_op()(ctx, gk, npw * 3);
-            castmem_var_h2h_op()(cpu_ctx, cpu_ctx, gk, reinterpret_cast<double*>(_gk), npw * 3);
+            resmem_var_op()(gk, npw * 3);
+            castmem_var_h2h_op()(gk, reinterpret_cast<double*>(_gk), npw * 3);
         }
         else
         {
@@ -499,7 +497,7 @@ void pseudopot_cell_vnl::getvnl(Device* ctx,
     ModuleBase::YlmReal::Ylm_Real(ctx, x1, npw, gk, ylm);
 
     std::complex<FPTYPE>* sk = nullptr;
-    resmem_complex_op()(ctx, sk, ucell.nat * npw);
+    resmem_complex_op()(sk, ucell.nat * npw);
     this->psf->get_sk(ctx, ik, this->wfcpw, sk);
 
     cal_vnl_op()(ctx,
@@ -529,18 +527,18 @@ void pseudopot_cell_vnl::getvnl(Device* ctx,
     delete[] h_atom_nh;
     delete[] h_atom_na;
     delete[] h_atom_nb;
-    delmem_var_op()(ctx, ylm);
-    delmem_var_op()(ctx, vkb1);
-    delmem_complex_op()(ctx, sk);
+    delmem_var_op()(ylm);
+    delmem_var_op()(vkb1);
+    delmem_complex_op()(sk);
     if (PARAM.inp.device == "gpu" || PARAM.inp.precision == "single")
     {
-        delmem_var_op()(ctx, gk);
+        delmem_var_op()(gk);
     }
     if (PARAM.inp.device == "gpu")
     {
-        delmem_int_op()(ctx, atom_nh);
-        delmem_int_op()(ctx, atom_nb);
-        delmem_int_op()(ctx, atom_na);
+        delmem_int_op()(atom_nh);
+        delmem_int_op()(atom_nb);
+        delmem_int_op()(atom_na);
     }
     ModuleBase::timer::tick("pp_cell_vnl", "getvnl");
 } // end subroutine getvnl
@@ -874,36 +872,36 @@ void pseudopot_cell_vnl::init_vnl(UnitCell& cell, const ModulePW::PW_Basis* rho_
     {
         if (PARAM.inp.precision == "single")
         {
-            castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_indv, this->indv.c, this->indv.nr * this->indv.nc);
-            castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_nhtol, this->nhtol.c, this->nhtol.nr * this->nhtol.nc);
-            castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_nhtolm, this->nhtolm.c, this->nhtolm.nr * this->nhtolm.nc);
-            castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_tab, this->tab.ptr, this->tab.getSize());
-            castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_qq_nt, this->qq_nt.ptr, this->qq_nt.getSize());
-            castmem_z2c_h2d_op()(gpu_ctx, cpu_ctx, this->c_qq_so, this->qq_so.ptr, this->qq_so.getSize());
+            castmem_d2s_h2d_op()(this->s_indv, this->indv.c, this->indv.nr * this->indv.nc);
+            castmem_d2s_h2d_op()(this->s_nhtol, this->nhtol.c, this->nhtol.nr * this->nhtol.nc);
+            castmem_d2s_h2d_op()(this->s_nhtolm, this->nhtolm.c, this->nhtolm.nr * this->nhtolm.nc);
+            castmem_d2s_h2d_op()(this->s_tab, this->tab.ptr, this->tab.getSize());
+            castmem_d2s_h2d_op()(this->s_qq_nt, this->qq_nt.ptr, this->qq_nt.getSize());
+            castmem_z2c_h2d_op()(this->c_qq_so, this->qq_so.ptr, this->qq_so.getSize());
         }
         else
         {
-            syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, this->z_qq_so, this->qq_so.ptr, this->qq_so.getSize());
+            syncmem_z2z_h2d_op()(this->z_qq_so, this->qq_so.ptr, this->qq_so.getSize());
         }
         // Even when the single precision flag is enabled,
         // these variables are utilized in the Force/Stress calculation as well.
         // modified by denghuilu at 2023-05-15
-        syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_indv, this->indv.c, this->indv.nr * this->indv.nc);
-        syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_nhtol, this->nhtol.c, this->nhtol.nr * this->nhtol.nc);
-        syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_nhtolm, this->nhtolm.c, this->nhtolm.nr * this->nhtolm.nc);
-        syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_tab, this->tab.ptr, this->tab.getSize());
-        syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_qq_nt, this->qq_nt.ptr, this->qq_nt.getSize());
+        syncmem_d2d_h2d_op()(this->d_indv, this->indv.c, this->indv.nr * this->indv.nc);
+        syncmem_d2d_h2d_op()(this->d_nhtol, this->nhtol.c, this->nhtol.nr * this->nhtol.nc);
+        syncmem_d2d_h2d_op()(this->d_nhtolm, this->nhtolm.c, this->nhtolm.nr * this->nhtolm.nc);
+        syncmem_d2d_h2d_op()(this->d_tab, this->tab.ptr, this->tab.getSize());
+        syncmem_d2d_h2d_op()(this->d_qq_nt, this->qq_nt.ptr, this->qq_nt.getSize());
     }
     else
     {
         if (PARAM.inp.precision == "single")
         {
-            castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_indv, this->indv.c, this->indv.nr * this->indv.nc);
-            castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_nhtol, this->nhtol.c, this->nhtol.nr * this->nhtol.nc);
-            castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_nhtolm, this->nhtolm.c, this->nhtolm.nr * this->nhtolm.nc);
-            castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_tab, this->tab.ptr, this->tab.getSize());
-            castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_qq_nt, this->qq_nt.ptr, this->qq_nt.getSize());
-            castmem_z2c_h2h_op()(cpu_ctx, cpu_ctx, this->c_qq_so, this->qq_so.ptr, this->qq_so.getSize());
+            castmem_d2s_h2h_op()(this->s_indv, this->indv.c, this->indv.nr * this->indv.nc);
+            castmem_d2s_h2h_op()(this->s_nhtol, this->nhtol.c, this->nhtol.nr * this->nhtol.nc);
+            castmem_d2s_h2h_op()(this->s_nhtolm, this->nhtolm.c, this->nhtolm.nr * this->nhtolm.nc);
+            castmem_d2s_h2h_op()(this->s_tab, this->tab.ptr, this->tab.getSize());
+            castmem_d2s_h2h_op()(this->s_qq_nt, this->qq_nt.ptr, this->qq_nt.getSize());
+            castmem_z2c_h2h_op()(this->c_qq_so, this->qq_so.ptr, this->qq_so.getSize());
         }
         // There's no need to synchronize double precision pointers while in a CPU environment.
     }
@@ -1082,7 +1080,7 @@ void pseudopot_cell_vnl::radial_fft_q(Device* ctx,
     const int ivl = nhtolm(itype, ih);
     const int jvl = nhtolm(itype, jh);
 
-    setmem_complex_op()(ctx, qg, 0, ng);
+    setmem_complex_op()(qg, 0, ng);
 
     const double* qnorm_double = reinterpret_cast<const double*>(qnorm);
 
@@ -1492,28 +1490,20 @@ void pseudopot_cell_vnl::cal_effective_D(const ModuleBase::matrix& veff,
     {
         if (PARAM.inp.precision == "single")
         {
-            castmem_d2s_h2d_op()(gpu_ctx,
-                                 cpu_ctx,
-                                 this->s_deeq,
+            castmem_d2s_h2d_op()(this->s_deeq,
                                  this->deeq.ptr,
                                  PARAM.inp.nspin * cell.nat * this->nhm * this->nhm);
-            castmem_z2c_h2d_op()(gpu_ctx,
-                                 cpu_ctx,
-                                 this->c_deeq_nc,
+            castmem_z2c_h2d_op()(this->c_deeq_nc,
                                  this->deeq_nc.ptr,
                                  PARAM.inp.nspin * cell.nat * this->nhm * this->nhm);
         }
         else
         {
-            syncmem_z2z_h2d_op()(gpu_ctx,
-                                 cpu_ctx,
-                                 this->z_deeq_nc,
+            syncmem_z2z_h2d_op()(this->z_deeq_nc,
                                  this->deeq_nc.ptr,
                                  PARAM.inp.nspin * cell.nat * this->nhm * this->nhm);
         }
-        syncmem_d2d_h2d_op()(gpu_ctx,
-                             cpu_ctx,
-                             this->d_deeq,
+        syncmem_d2d_h2d_op()(this->d_deeq,
                              this->deeq.ptr,
                              PARAM.inp.nspin * cell.nat * this->nhm * this->nhm);
     }
@@ -1521,14 +1511,10 @@ void pseudopot_cell_vnl::cal_effective_D(const ModuleBase::matrix& veff,
     {
         if (PARAM.inp.precision == "single")
         {
-            castmem_d2s_h2h_op()(cpu_ctx,
-                                 cpu_ctx,
-                                 this->s_deeq,
+            castmem_d2s_h2h_op()(this->s_deeq,
                                  this->deeq.ptr,
                                  PARAM.inp.nspin * cell.nat * this->nhm * this->nhm);
-            castmem_z2c_h2h_op()(cpu_ctx,
-                                 cpu_ctx,
-                                 this->c_deeq_nc,
+            castmem_z2c_h2h_op()(this->c_deeq_nc,
                                  this->deeq_nc.ptr,
                                  PARAM.inp.nspin * cell.nat * this->nhm * this->nhm);
         }
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp
index 3346724deb..41184b11d0 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp
@@ -134,17 +134,17 @@ void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
     }
 
 	if(this->device == base_device::GpuDevice ) {
-		resmem_var_op()(this->ctx, gv_x_d, rho_basis->npw);
-        resmem_var_op()(this->ctx, gv_y_d, rho_basis->npw);
-        resmem_var_op()(this->ctx, gv_z_d, rho_basis->npw);
-        resmem_var_op()(this->ctx, rhocgigg_vec_d, rho_basis->npw);
-        resmem_complex_op()(this->ctx, psiv_d, rho_basis->nmaxgr);
-        resmem_var_op()(this->ctx, force_d, 3);
-
-		syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gv_x_d, gv_x.data(), rho_basis->npw);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gv_y_d, gv_y.data(), rho_basis->npw);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gv_z_d, gv_z.data(), rho_basis->npw);
-        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, psiv_d, psiv, rho_basis->nmaxgr);
+		resmem_var_op()(gv_x_d, rho_basis->npw);
+        resmem_var_op()(gv_y_d, rho_basis->npw);
+        resmem_var_op()(gv_z_d, rho_basis->npw);
+        resmem_var_op()(rhocgigg_vec_d, rho_basis->npw);
+        resmem_complex_op()(psiv_d, rho_basis->nmaxgr);
+        resmem_var_op()(force_d, 3);
+
+		syncmem_var_h2d_op()(gv_x_d, gv_x.data(), rho_basis->npw);
+        syncmem_var_h2d_op()(gv_y_d, gv_y.data(), rho_basis->npw);
+        syncmem_var_h2d_op()(gv_z_d, gv_z.data(), rho_basis->npw);
+        syncmem_complex_h2d_op()(psiv_d, psiv, rho_basis->nmaxgr);
 	}
 
 
@@ -178,7 +178,7 @@ void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
             }
 
             if(this->device == base_device::GpuDevice ) {
-                syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rhocgigg_vec_d, rhocgigg_vec.data(), rho_basis->npw);
+                syncmem_var_h2d_op()(rhocgigg_vec_d, rhocgigg_vec.data(), rho_basis->npw);
             }
             for (int ia = 0; ia < ucell_in.atoms[it].na; ++ia)
             {
@@ -188,12 +188,12 @@ void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
                 double force[3] = {0, 0, 0};
 
                 if(this->device == base_device::GpuDevice ) {
-                    syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, force_d, force, 3);
+                    syncmem_var_h2d_op()(force_d, force, 3);
                     hamilt::cal_force_npw_op<FPTYPE, Device>()(
                         psiv_d, gv_x_d, gv_y_d, gv_z_d, rhocgigg_vec_d, force_d, pos.x, pos.y, pos.z, 
                         rho_basis->npw, ucell_in.omega, ucell_in.tpiba
                     );      
-                    syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force, force_d, 3);	          
+                    syncmem_var_d2h_op()(force, force_d, 3);	          
                 
                 } else {
                     hamilt::cal_force_npw_op<FPTYPE, Device>()(
@@ -213,12 +213,12 @@ void Forces<FPTYPE, Device>::cal_force_cc(ModuleBase::matrix& forcecc,
     }
     if (this->device == base_device::GpuDevice)
     {
-        delmem_var_op()(this->ctx, gv_x_d);
-        delmem_var_op()(this->ctx, gv_y_d);
-        delmem_var_op()(this->ctx, gv_z_d);
-        delmem_var_op()(this->ctx, force_d);
-        delmem_var_op()(this->ctx, rhocgigg_vec_d);
-        delmem_complex_op()(this->ctx, psiv_d);
+        delmem_var_op()(gv_x_d);
+        delmem_var_op()(gv_y_d);
+        delmem_var_op()(gv_z_d);
+        delmem_var_op()(force_d);
+        delmem_var_op()(rhocgigg_vec_d);
+        delmem_complex_op()(psiv_d);
     }
     delete[] rhocg;
 
@@ -308,24 +308,24 @@ void Forces<FPTYPE, Device>::deriv_drhoc
     double *aux_d = nullptr;
     double *drhocg_d = nullptr;
 	if(this->device == base_device::GpuDevice ) {
-		resmem_var_op()(this->ctx, r_d, mesh);
-		resmem_var_op()(this->ctx, rhoc_d, mesh);
-		resmem_var_op()(this->ctx, rab_d, mesh);
-
-		resmem_var_op()(this->ctx, aux_d, mesh);
-		resmem_var_op()(this->ctx, gx_arr_d, rho_basis->ngg);
-		resmem_var_op()(this->ctx, drhocg_d, rho_basis->ngg);
-
-		syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gx_arr_d, gx_arr.data(), rho_basis->ngg);
-		syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, r_d, r, mesh);
-		syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rab_d, rab, mesh);
-		syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rhoc_d, rhoc, mesh);
+		resmem_var_op()(r_d, mesh);
+		resmem_var_op()(rhoc_d, mesh);
+		resmem_var_op()(rab_d, mesh);
+
+		resmem_var_op()(aux_d, mesh);
+		resmem_var_op()(gx_arr_d, rho_basis->ngg);
+		resmem_var_op()(drhocg_d, rho_basis->ngg);
+
+		syncmem_var_h2d_op()(gx_arr_d, gx_arr.data(), rho_basis->ngg);
+		syncmem_var_h2d_op()(r_d, r, mesh);
+		syncmem_var_h2d_op()(rab_d, rab, mesh);
+		syncmem_var_h2d_op()(rhoc_d, rhoc, mesh);
 	}
 
 	if(this->device == base_device::GpuDevice) {
 		hamilt::cal_stress_drhoc_aux_op<FPTYPE, Device>()(
 			r_d,rhoc_d,gx_arr_d+igl0,rab_d,drhocg_d+igl0,mesh,igl0,rho_basis->ngg-igl0,ucell_in.omega,type);
-		syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, drhocg+igl0, drhocg_d+igl0, rho_basis->ngg-igl0);	
+		syncmem_var_d2h_op()(drhocg+igl0, drhocg_d+igl0, rho_basis->ngg-igl0);	
 
 
 
@@ -334,11 +334,11 @@ void Forces<FPTYPE, Device>::deriv_drhoc
 			r,rhoc,gx_arr.data()+igl0,rab,drhocg+igl0,mesh,igl0,rho_basis->ngg-igl0,ucell_in.omega,type);
     }
 
-    delmem_var_op()(this->ctx, r_d);
-    delmem_var_op()(this->ctx, rhoc_d);
-    delmem_var_op()(this->ctx, rab_d);
-    delmem_var_op()(this->ctx, gx_arr_d);
-    delmem_var_op()(this->ctx, drhocg_d);
+    delmem_var_op()(r_d);
+    delmem_var_op()(rhoc_d);
+    delmem_var_op()(rab_d);
+    delmem_var_op()(gx_arr_d);
+    delmem_var_op()(drhocg_d);
     return;
 }
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
index 8ecba030f3..bd615f0eef 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp
@@ -27,8 +27,8 @@ void Forces<FPTYPE, Device>::cal_force_nl(ModuleBase::matrix& forcenl,
 
     // allocate memory for the force
     FPTYPE* force = nullptr;
-    resmem_var_op()(this->ctx, force, ucell_in.nat * 3);
-    base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, force, 0.0, ucell_in.nat * 3);
+    resmem_var_op()(force, ucell_in.nat * 3);
+    base_device::memory::set_memory_op<FPTYPE, Device>()(force, 0.0, ucell_in.nat * 3);
 
     hamilt::FS_Nonlocal_tools<FPTYPE, Device> nl_tools(&nlpp, &ucell_in, p_kv, wfc_basis, p_sf, wg, &ekb);
 
@@ -62,8 +62,8 @@ void Forces<FPTYPE, Device>::cal_force_nl(ModuleBase::matrix& forcenl,
         nl_tools.cal_force(ik, max_nbands, npm, true, force);
     } // end ik
 
-    syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forcenl.c, force, forcenl.nr * forcenl.nc);
-    delmem_var_op()(this->ctx, force);
+    syncmem_var_d2h_op()(forcenl.c, force, forcenl.nr * forcenl.nc);
+    delmem_var_op()(force);
     // sum up forcenl from all processors
     Parallel_Reduce::reduce_all(forcenl.c, forcenl.nr * forcenl.nc);
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp
index 240187b3ba..36f90f0001 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp
@@ -23,8 +23,8 @@ void Forces<FPTYPE, Device>::cal_force_onsite(ModuleBase::matrix& force_onsite,
 
     // allocate memory for the force
     FPTYPE* force = nullptr;
-    resmem_var_op()(this->ctx, force, ucell_in.nat * 3);
-    base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, force, 0.0, ucell_in.nat * 3);
+    resmem_var_op()(force, ucell_in.nat * 3);
+    base_device::memory::set_memory_op<FPTYPE, Device>()(force, 0.0, ucell_in.nat * 3);
 
     auto* onsite_p = projectors::OnsiteProjector<FPTYPE, Device>::get_instance();
 
@@ -65,8 +65,8 @@ void Forces<FPTYPE, Device>::cal_force_onsite(ModuleBase::matrix& force_onsite,
         
     } // end ik
 
-    syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_onsite.c, force, force_onsite.nr * force_onsite.nc);
-    delmem_var_op()(this->ctx, force);
+    syncmem_var_d2h_op()(force_onsite.c, force, force_onsite.nr * force_onsite.nc);
+    delmem_var_op()(force);
     // sum up force_onsite from all processors
     Parallel_Reduce::reduce_all(force_onsite.c, force_onsite.nr * force_onsite.nc);
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp
index f670ad9b27..ab63f43aff 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp
@@ -190,28 +190,26 @@ void Forces<FPTYPE, Device>::deriv_drhoc_scc(const bool& numeric,
     double *aux_d = nullptr;
     double *drhocg_d = nullptr;
     if (this->device == base_device::GpuDevice) {
-        resmem_var_op()(this->ctx, r_d, mesh);
-        resmem_var_op()(this->ctx, rhoc_d, mesh);
-        resmem_var_op()(this->ctx, rab_d, mesh);
+        resmem_var_op()(r_d, mesh);
+        resmem_var_op()(rhoc_d, mesh);
+        resmem_var_op()(rab_d, mesh);
 
-        resmem_var_op()(this->ctx, aux_d, mesh);
-        resmem_var_op()(this->ctx, gx_arr_d, rho_basis->ngg);
-        resmem_var_op()(this->ctx, drhocg_d, rho_basis->ngg);
+        resmem_var_op()(aux_d, mesh);
+        resmem_var_op()(gx_arr_d, rho_basis->ngg);
+        resmem_var_op()(drhocg_d, rho_basis->ngg);
 
-        syncmem_var_h2d_op()(this->ctx,
-                             this->cpu_ctx,
-                             gx_arr_d,
+        syncmem_var_h2d_op()(gx_arr_d,
                              gx_arr.data(),
                              rho_basis->ngg);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, r_d, r, mesh);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rab_d, rab, mesh);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rhoc_d, rhoc, mesh);
+        syncmem_var_h2d_op()(r_d, r, mesh);
+        syncmem_var_h2d_op()(rab_d, rab, mesh);
+        syncmem_var_h2d_op()(rhoc_d, rhoc, mesh);
     }
 
 	if(this->device == base_device::GpuDevice) {
 		hamilt::cal_stress_drhoc_aux_op<FPTYPE, Device>()(
 			r_d,rhoc_d,gx_arr_d+igl0,rab_d,drhocg_d+igl0,mesh,igl0,rho_basis->ngg-igl0,ucell_in.omega,2);
-		syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, drhocg+igl0, drhocg_d+igl0, rho_basis->ngg-igl0);	
+		syncmem_var_d2h_op()(drhocg+igl0, drhocg_d+igl0, rho_basis->ngg-igl0);	
 
 	} else {
 		hamilt::cal_stress_drhoc_aux_op<FPTYPE, Device>()(
@@ -219,11 +217,11 @@ void Forces<FPTYPE, Device>::deriv_drhoc_scc(const bool& numeric,
 
 	}
 
-    delmem_var_op()(this->ctx, r_d);
-    delmem_var_op()(this->ctx, rhoc_d);
-    delmem_var_op()(this->ctx, rab_d);
-    delmem_var_op()(this->ctx, gx_arr_d);
-    delmem_var_op()(this->ctx, drhocg_d);
+    delmem_var_op()(r_d);
+    delmem_var_op()(rhoc_d);
+    delmem_var_op()(rab_d);
+    delmem_var_op()(gx_arr_d);
+    delmem_var_op()(drhocg_d);
     return;
 }
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp
index 89efb3f879..00049866f9 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp
@@ -27,8 +27,8 @@ FS_Kin_tools<FPTYPE, Device>::FS_Kin_tools(const UnitCell& ucell_in,
 
     if (this->device == base_device::GpuDevice)
     {
-        resmem_var_op()(this->ctx, d_gk, 3 * npwk_max);
-        resmem_var_op()(this->ctx, d_kfac, npwk_max);
+        resmem_var_op()(d_gk, 3 * npwk_max);
+        resmem_var_op()(d_kfac, npwk_max);
     }
     else
     {
@@ -42,8 +42,8 @@ FS_Kin_tools<FPTYPE, Device>::~FS_Kin_tools()
 {
     if (this->device == base_device::GpuDevice)
     {
-        delmem_var_op()(this->ctx, d_gk);
-        delmem_var_op()(this->ctx, d_kfac);
+        delmem_var_op()(d_gk);
+        delmem_var_op()(d_kfac);
     }
 }
 
@@ -72,8 +72,8 @@ void FS_Kin_tools<FPTYPE, Device>::cal_gk(const int& ik)
     }
     if (this->device == base_device::GpuDevice)
     {
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gk, gk[0], 3 * npwk_max);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_kfac, kfac.data(), npwk_max);
+        syncmem_var_h2d_op()(d_gk, gk[0], 3 * npwk_max);
+        syncmem_var_h2d_op()(d_kfac, kfac.data(), npwk_max);
     }
 }
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
index 810b313292..523cb2b504 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
@@ -73,36 +73,36 @@ void FS_Nonlocal_tools<FPTYPE, Device>::allocate_memory(const ModuleBase::matrix
     // allocate the memory for vkb and vkb_deri.
     if (this->device == base_device::GpuDevice)
     {
-        resmem_int_op()(this->ctx, this->d_dvkb_indexes, max_nh * 4);
+        resmem_int_op()(this->d_dvkb_indexes, max_nh * 4);
     }
 
-    resmem_var_op()(this->ctx, this->hd_vq, max_nbeta * max_npw);
-    resmem_var_op()(this->ctx, this->hd_vq_deri, max_nbeta * max_npw);
+    resmem_var_op()(this->hd_vq, max_nbeta * max_npw);
+    resmem_var_op()(this->hd_vq_deri, max_nbeta * max_npw);
     const int _lmax = this->nlpp_->lmaxkb;
-    resmem_var_op()(this->ctx, this->hd_ylm, (_lmax + 1) * (_lmax + 1) * max_npw);
-    resmem_var_op()(this->ctx, this->hd_ylm_deri, 3 * (_lmax + 1) * (_lmax + 1) * max_npw);
+    resmem_var_op()(this->hd_ylm, (_lmax + 1) * (_lmax + 1) * max_npw);
+    resmem_var_op()(this->hd_ylm_deri, 3 * (_lmax + 1) * (_lmax + 1) * max_npw);
     const int nks = this->kv_->get_nks();
-    resmem_var_op()(this->ctx, d_wk, nks);
-    syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wk, this->kv_->wk.data(), nks);
+    resmem_var_op()(d_wk, nks);
+    syncmem_var_h2d_op()(d_wk, this->kv_->wk.data(), nks);
 
     if (this->device == base_device::GpuDevice)
     {
-        resmem_var_op()(this->ctx, d_wg, wg.nr * wg.nc);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, wg.c, wg.nr * wg.nc);
+        resmem_var_op()(d_wg, wg.nr * wg.nc);
+        syncmem_var_h2d_op()(d_wg, wg.c, wg.nr * wg.nc);
         if (p_ekb != nullptr)
         {
-            resmem_var_op()(this->ctx, d_ekb, p_ekb->nr * p_ekb->nc);
-            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_ekb, p_ekb->c, p_ekb->nr * p_ekb->nc);
+            resmem_var_op()(d_ekb, p_ekb->nr * p_ekb->nc);
+            syncmem_var_h2d_op()(d_ekb, p_ekb->c, p_ekb->nr * p_ekb->nc);
         }
-        resmem_int_op()(this->ctx, atom_nh, this->ntype);
-        resmem_int_op()(this->ctx, atom_na, this->ntype);
-        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, atom_nh, h_atom_nh.data(), this->ntype);
-        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, atom_na, h_atom_na.data(), this->ntype);
+        resmem_int_op()(atom_nh, this->ntype);
+        resmem_int_op()(atom_na, this->ntype);
+        syncmem_int_h2d_op()(atom_nh, h_atom_nh.data(), this->ntype);
+        syncmem_int_h2d_op()(atom_na, h_atom_na.data(), this->ntype);
 
-        resmem_var_op()(this->ctx, d_g_plus_k, max_npw * 5);
-        resmem_var_op()(this->ctx, d_pref, max_nh);
-        resmem_var_op()(this->ctx, d_vq_tab, this->nlpp_->tab.getSize());
-        resmem_complex_op()(this->ctx, d_pref_in, max_nh);
+        resmem_var_op()(d_g_plus_k, max_npw * 5);
+        resmem_var_op()(d_pref, max_nh);
+        resmem_var_op()(d_vq_tab, this->nlpp_->tab.getSize());
+        resmem_complex_op()(d_pref_in, max_nh);
 
         this->ppcell_vkb = this->nlpp_->template get_vkb_data<FPTYPE>();
     }
@@ -124,40 +124,40 @@ void FS_Nonlocal_tools<FPTYPE, Device>::delete_memory()
 {
     // delete memory
 
-    delmem_var_op()(this->ctx, hd_vq);
-    delmem_var_op()(this->ctx, hd_vq_deri);
-    delmem_var_op()(this->ctx, hd_ylm);
-    delmem_var_op()(this->ctx, hd_ylm_deri);
-    delmem_var_op()(this->ctx, d_wk);
+    delmem_var_op()(hd_vq);
+    delmem_var_op()(hd_vq_deri);
+    delmem_var_op()(hd_ylm);
+    delmem_var_op()(hd_ylm_deri);
+    delmem_var_op()(d_wk);
 
     // delete memory on GPU
     if (this->device == base_device::GpuDevice)
     {
-        delmem_var_op()(this->ctx, d_wg);
-        delmem_var_op()(this->ctx, d_ekb);
-        delmem_int_op()(this->ctx, atom_nh);
-        delmem_int_op()(this->ctx, atom_na);
-        delmem_var_op()(this->ctx, d_g_plus_k);
-        delmem_var_op()(this->ctx, d_pref);
-        delmem_var_op()(this->ctx, d_vq_tab);
-        delmem_complex_op()(this->ctx, this->d_pref_in);
-        delmem_int_op()(this->ctx, d_dvkb_indexes);
+        delmem_var_op()(d_wg);
+        delmem_var_op()(d_ekb);
+        delmem_int_op()(atom_nh);
+        delmem_int_op()(atom_na);
+        delmem_var_op()(d_g_plus_k);
+        delmem_var_op()(d_pref);
+        delmem_var_op()(d_vq_tab);
+        delmem_complex_op()(this->d_pref_in);
+        delmem_int_op()(d_dvkb_indexes);
     }
 
     if (becp != nullptr)
     {
-        delmem_complex_op()(this->ctx, becp);
-        delmem_complex_op()(this->ctx, hd_sk);
+        delmem_complex_op()(becp);
+        delmem_complex_op()(hd_sk);
     }
     if (dbecp != nullptr)
     {
-        delmem_complex_op()(this->ctx, dbecp);
+        delmem_complex_op()(dbecp);
     }
     if (this->pre_ik_f != -1)
     {
-        delmem_int_op()(this->ctx, gcar_zero_indexes);
-        delmem_complex_op()(this->ctx, vkb_save);
-        delmem_var_op()(this->ctx, gcar);
+        delmem_int_op()(gcar_zero_indexes);
+        delmem_complex_op()(vkb_save);
+        delmem_var_op()(gcar);
     }
 }
 
@@ -170,7 +170,7 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_vkb(const int& ik, const int& nbdall
     const int size_becp = nbdall * npol * this->nkb;
     if (this->becp == nullptr)
     {
-        resmem_complex_op()(this->ctx, becp, size_becp);
+        resmem_complex_op()(becp, size_becp);
     }
 
     // prepare math tools
@@ -183,7 +183,7 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_vkb(const int& ik, const int& nbdall
     this->g_plus_k = maths.cal_gk(ik, this->wfc_basis_);
     FPTYPE *gk = g_plus_k.data(), *vq_tb = this->nlpp_->tab.ptr;
     // calculate sk
-    resmem_complex_op()(ctx, hd_sk, this->ucell_->nat * npw);
+    resmem_complex_op()(hd_sk, this->ucell_->nat * npw);
     this->sf_->get_sk(ctx, ik, this->wfc_basis_, hd_sk);
     std::complex<FPTYPE>* d_sk = this->hd_sk;
     // prepare ylm，size: (lmax+1)^2 * this->max_npw
@@ -191,8 +191,8 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_vkb(const int& ik, const int& nbdall
     maths.cal_ylm(lmax_, npw, g_plus_k.data(), hd_ylm);
     if (this->device == base_device::GpuDevice)
     {
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_g_plus_k, g_plus_k.data(), g_plus_k.size());
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vq_tab, this->nlpp_->tab.ptr, this->nlpp_->tab.getSize());
+        syncmem_var_h2d_op()(d_g_plus_k, g_plus_k.data(), g_plus_k.size());
+        syncmem_var_h2d_op()(d_vq_tab, this->nlpp_->tab.ptr, this->nlpp_->tab.getSize());
         gk = d_g_plus_k;
         vq_tb = d_vq_tab;
     }
@@ -229,8 +229,8 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_vkb(const int& ik, const int& nbdall
                              this->dvkb_indexes.data());
         if (this->device == base_device::GpuDevice)
         {
-            syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_dvkb_indexes, dvkb_indexes.data(), nh * 4);
-            syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_pref_in, pref.data(), nh);
+            syncmem_int_h2d_op()(d_dvkb_indexes, dvkb_indexes.data(), nh * 4);
+            syncmem_complex_h2d_op()(d_pref_in, pref.data(), nh);
         }
 
         for (int ia = 0; ia < h_atom_na[it]; ia++)
@@ -312,7 +312,7 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_vkb_deri_s(const int& ik,
     const int size_becp = nbdall * npol * this->nkb;
     if (this->dbecp == nullptr)
     {
-        resmem_complex_op()(this->ctx, dbecp, size_becp);
+        resmem_complex_op()(dbecp, size_becp);
     }
 
     // prepare math tools
@@ -383,8 +383,8 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_vkb_deri_s(const int& ik,
                              this->dvkb_indexes.data());
         if (this->device == base_device::GpuDevice)
         {
-            syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_dvkb_indexes, dvkb_indexes.data(), nh * 4);
-            syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_pref_in, pref.data(), nh);
+            syncmem_int_h2d_op()(d_dvkb_indexes, dvkb_indexes.data(), nh * 4);
+            syncmem_complex_h2d_op()(d_pref_in, pref.data(), nh);
         }
         for (int ia = 0; ia < h_atom_na[it]; ia++)
         {
@@ -538,7 +538,7 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_vkb_deri_f(const int& ik, const int&
     const int size_becp = nbdall * npol * this->nkb;
     if (this->dbecp == nullptr)
     {
-        resmem_complex_op()(this->ctx, dbecp, 3 * size_becp);
+        resmem_complex_op()(dbecp, 3 * size_becp);
     }
 
     const std::complex<FPTYPE>* vkb_ptr = this->ppcell_vkb;
@@ -547,8 +547,8 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_vkb_deri_f(const int& ik, const int&
     const int npw = this->wfc_basis_->npwk[ik];
     if (this->pre_ik_f == -1)
     {
-        resmem_var_op()(this->ctx, gcar, 3 * this->wfc_basis_->npwk_max);
-        resmem_int_op()(this->ctx, gcar_zero_indexes, 3 * this->wfc_basis_->npwk_max);
+        resmem_var_op()(gcar, 3 * this->wfc_basis_->npwk_max);
+        resmem_int_op()(gcar_zero_indexes, 3 * this->wfc_basis_->npwk_max);
     }
 
     if (this->pre_ik_f != ik)
@@ -730,10 +730,10 @@ void FS_Nonlocal_tools<FPTYPE, Device>::transfer_gcar(const int& npw, const int&
     }
     // prepare the memory for vkb_save
     const int max_count = std::max(gcar_zero_counts[0], std::max(gcar_zero_counts[1], gcar_zero_counts[2]));
-    resmem_complex_op()(this->ctx, this->vkb_save, this->nkb * max_count);
+    resmem_complex_op()(this->vkb_save, this->nkb * max_count);
     // transfer the gcar and gcar_zero_indexes to the device
-    syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gcar, gcar_tmp.data(), 3 * npw_max);
-    syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, gcar_zero_indexes, gcar_zero_indexes_tmp.data(), 3 * npw_max);
+    syncmem_var_h2d_op()(gcar, gcar_tmp.data(), 3 * npw_max);
+    syncmem_int_h2d_op()(gcar_zero_indexes, gcar_zero_indexes_tmp.data(), 3 * npw_max);
 }
 
 // cal_force
diff --git a/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp
index 38ccd9632c..f877eb1985 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp
@@ -246,7 +246,7 @@ void HamiltPW<T, Device>::sPsi(const T* psi_in, // psi
         return;
     }
 
-    syncmem_op()(this->ctx, this->ctx, spsi, psi_in, static_cast<size_t>(nbands * nrow));
+    syncmem_op()(spsi, psi_in, static_cast<size_t>(nbands * nrow));
     if (PARAM.globalv.use_uspp)
     {
         T* becp = nullptr;
@@ -254,7 +254,7 @@ void HamiltPW<T, Device>::sPsi(const T* psi_in, // psi
         // psi updated, thus update <beta|psi>
         if (this->ppcell->nkb > 0)
         {
-            resmem_complex_op()(this->ctx, becp, nbands * this->ppcell->nkb, "Hamilt<PW>::becp");
+            resmem_complex_op()(becp, nbands * this->ppcell->nkb, "Hamilt<PW>::becp");
             char transa = 'C';
             char transb = 'N';
             if (nbands == 1)
@@ -294,8 +294,8 @@ void HamiltPW<T, Device>::sPsi(const T* psi_in, // psi
             Parallel_Reduce::reduce_pool(becp, this->ppcell->nkb * nbands);
         }
 
-        resmem_complex_op()(this->ctx, ps, this->ppcell->nkb * nbands, "Hamilt<PW>::ps");
-        setmem_complex_op()(this->ctx, ps, 0, this->ppcell->nkb * nbands);
+        resmem_complex_op()(ps, this->ppcell->nkb * nbands, "Hamilt<PW>::ps");
+        setmem_complex_op()(ps, 0, this->ppcell->nkb * nbands);
 
         // spsi = psi + sum qq <beta|psi> |beta>
         if (PARAM.inp.noncolin)
@@ -316,7 +316,7 @@ void HamiltPW<T, Device>::sPsi(const T* psi_in, // psi
                 {
                     const int nh = atoms->ncpp.nh;
                     T* qqc = nullptr;
-                    resmem_complex_op()(this->ctx, qqc, nh * nh, "Hamilt<PW>::qqc");
+                    resmem_complex_op()(qqc, nh * nh, "Hamilt<PW>::qqc");
                     Real* qq_now = &qq_nt[it * this->ppcell->nhm * this->ppcell->nhm];
                     for (int i = 0; i < nh; i++)
                     {
@@ -344,7 +344,7 @@ void HamiltPW<T, Device>::sPsi(const T* psi_in, // psi
                                   &ps[this->ppcell->indv_ijkb0[iat]],
                                   this->ppcell->nkb);
                     }
-                    delmem_complex_op()(ctx, qqc);
+                    delmem_complex_op()(qqc);
                 }
             }
 
@@ -382,8 +382,8 @@ void HamiltPW<T, Device>::sPsi(const T* psi_in, // psi
                           nrow);
             }
         }
-        delmem_complex_op()(this->ctx, ps);
-        delmem_complex_op()(this->ctx, becp);
+        delmem_complex_op()(ps);
+        delmem_complex_op()(becp);
     }
 }
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/ekinetic_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/ekinetic_op_test.cpp
index 324d4fb752..7c06dfc154 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/ekinetic_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/ekinetic_op_test.cpp
@@ -81,22 +81,22 @@ TEST_F(TestModuleHamiltEkinetic, ekinetic_pw_op_gpu)
 {
   double* gk2_dev = NULL;
   std::complex<double>* hpsi_dev = NULL, * psi_dev = NULL;
-  resize_memory_double_op()(gpu_ctx, gk2_dev, gk2.size());
-  resize_memory_complex_double_op()(gpu_ctx, psi_dev, psi.size());
+  resize_memory_double_op()(gk2_dev, gk2.size());
+  resize_memory_complex_double_op()(psi_dev, psi.size());
   std::vector<std::complex<double> > hpsi(expected_hpsi.size(), std::complex<double>(0.0, 0.0));
-  resize_memory_complex_double_op()(gpu_ctx, hpsi_dev, hpsi.size());
-  syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, hpsi_dev, hpsi.data(), hpsi.size());
-  syncmem_d_h2d_op()(gpu_ctx, cpu_ctx, gk2_dev, gk2.data(), gk2.size());
-  syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, psi_dev, psi.data(), psi.size());
+  resize_memory_complex_double_op()(hpsi_dev, hpsi.size());
+  syncmem_cd_h2d_op()(hpsi_dev, hpsi.data(), hpsi.size());
+  syncmem_d_h2d_op()(gk2_dev, gk2.data(), gk2.size());
+  syncmem_cd_h2d_op()(psi_dev, psi.data(), psi.size());
   // ekinetic_cpu_op()(cpu_ctx, band, dim, dim, tpiba2, gk2.data(), hpsi.data(), psi.data());
   ekinetic_gpu_op()(gpu_ctx, band, dim, dim, false, tpiba2, gk2_dev, hpsi_dev, psi_dev);
-  syncmem_cd_d2h_op()(cpu_ctx, gpu_ctx, hpsi.data(), hpsi_dev, hpsi.size());
+  syncmem_cd_d2h_op()(hpsi.data(), hpsi_dev, hpsi.size());
 
   for (int ii = 0; ii < hpsi.size(); ii++) {
     EXPECT_LT(fabs(hpsi[ii] - expected_hpsi[ii]), 1e-6);
   }
-  delete_memory_double_op()(gpu_ctx, gk2_dev);
-  delete_memory_complex_double_op()(gpu_ctx, psi_dev);
-  delete_memory_complex_double_op()(gpu_ctx, hpsi_dev);
+  delete_memory_double_op()(gk2_dev);
+  delete_memory_complex_double_op()(psi_dev);
+  delete_memory_complex_double_op()(hpsi_dev);
 }
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp
index 0507ff3358..be237b64ba 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp
@@ -2919,12 +2919,12 @@ TEST_F(TestSrcPWForceMultiDevice, cal_vkb1_nl_op_gpu)
     std::vector<std::complex<double>> res = vkb1;
     std::complex<double>*d_res = nullptr, *d_vkb = nullptr;
     double* d_gcar = nullptr;
-    resmem_complex_op()(gpu_ctx, d_res, res.size());
-    resmem_complex_op()(gpu_ctx, d_vkb, vkb.size());
-    resmem_var_op()(gpu_ctx, d_gcar, gcar.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_vkb, vkb.data(), vkb.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_gcar, gcar.data(), gcar.size());
+    resmem_complex_op()(d_res, res.size());
+    resmem_complex_op()(d_vkb, vkb.size());
+    resmem_var_op()(d_gcar, gcar.size());
+    syncmem_complex_h2d_op()(d_res, res.data(), res.size());
+    syncmem_complex_h2d_op()(d_vkb, vkb.data(), vkb.size());
+    syncmem_var_h2d_op()(d_gcar, gcar.data(), gcar.size());
 
     hamilt::cal_vkb1_nl_op<double, base_device::DEVICE_GPU>()(gpu_ctx,
                                                               nkb,
@@ -2936,16 +2936,16 @@ TEST_F(TestSrcPWForceMultiDevice, cal_vkb1_nl_op_gpu)
                                                               d_vkb,
                                                               d_gcar,
                                                               d_res);
-    syncmem_complex_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size());
+    syncmem_complex_d2h_op()(res.data(), d_res, res.size());
 
     for (int ii = 0; ii < res.size(); ii++)
     {
         EXPECT_LT(fabs(res[ii] - expected_vkb1[ii]), 6e-5);
     }
 
-    delmem_complex_op()(gpu_ctx, d_res);
-    delmem_complex_op()(gpu_ctx, d_vkb);
-    delmem_var_op()(gpu_ctx, d_gcar);
+    delmem_complex_op()(d_res);
+    delmem_complex_op()(d_vkb);
+    delmem_var_op()(d_gcar);
 }
 
 TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu)
@@ -2953,28 +2953,28 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu)
     std::vector<double> res(expected_force.size(), 0);
     double *d_res = nullptr, *d_wg = nullptr, *d_deeq = nullptr;
     double *d_ekb = nullptr, *d_qq_nt = nullptr;
-    resmem_var_op()(gpu_ctx, d_wg, wg.size());
-    resmem_var_op()(gpu_ctx, d_res, res.size());
-    resmem_var_op()(gpu_ctx, d_deeq, deeq.size());
-    resmem_var_op()(gpu_ctx, d_ekb, ekb.size());
-    resmem_var_op()(gpu_ctx, d_qq_nt, qq_nt.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_wg, wg.data(), wg.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_deeq, deeq.data(), deeq.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_ekb, ekb.data(), ekb.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_qq_nt, qq_nt.data(), qq_nt.size());
+    resmem_var_op()(d_wg, wg.size());
+    resmem_var_op()(d_res, res.size());
+    resmem_var_op()(d_deeq, deeq.size());
+    resmem_var_op()(d_ekb, ekb.size());
+    resmem_var_op()(d_qq_nt, qq_nt.size());
+    syncmem_var_h2d_op()(d_wg, wg.data(), wg.size());
+    syncmem_var_h2d_op()(d_res, res.data(), res.size());
+    syncmem_var_h2d_op()(d_deeq, deeq.data(), deeq.size());
+    syncmem_var_h2d_op()(d_ekb, ekb.data(), ekb.size());
+    syncmem_var_h2d_op()(d_qq_nt, qq_nt.data(), qq_nt.size());
 
     int *d_atom_nh = nullptr, *d_atom_na = nullptr;
-    resmem_int_op()(gpu_ctx, d_atom_nh, atom_nh.size());
-    resmem_int_op()(gpu_ctx, d_atom_na, atom_na.size());
-    syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_nh, atom_nh.data(), atom_nh.size());
-    syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_na, atom_na.data(), atom_na.size());
+    resmem_int_op()(d_atom_nh, atom_nh.size());
+    resmem_int_op()(d_atom_na, atom_na.size());
+    syncmem_int_h2d_op()(d_atom_nh, atom_nh.data(), atom_nh.size());
+    syncmem_int_h2d_op()(d_atom_na, atom_na.data(), atom_na.size());
 
     std::complex<double>*d_becp = nullptr, *d_dbecp = nullptr;
-    resmem_complex_op()(gpu_ctx, d_becp, becp.size());
-    resmem_complex_op()(gpu_ctx, d_dbecp, dbecp.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_becp, becp.data(), becp.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_dbecp, dbecp.data(), dbecp.size());
+    resmem_complex_op()(d_becp, becp.size());
+    resmem_complex_op()(d_dbecp, dbecp.size());
+    syncmem_complex_h2d_op()(d_becp, becp.data(), becp.size());
+    syncmem_complex_h2d_op()(d_dbecp, dbecp.data(), dbecp.size());
 
     hamilt::cal_force_nl_op<double, base_device::DEVICE_GPU>()(gpu_ctx,
                                                                multi_proj,
@@ -2998,23 +2998,23 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu)
                                                                d_becp,
                                                                d_dbecp,
                                                                d_res);
-    syncmem_var_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size());
+    syncmem_var_d2h_op()(res.data(), d_res, res.size());
 
     for (int ii = 0; ii < res.size(); ii++)
     {
         EXPECT_LT(fabs(res[ii] - expected_force[ii]), 6e-5);
     }
 
-    delmem_var_op()(gpu_ctx, d_wg);
-    delmem_var_op()(gpu_ctx, d_res);
-    delmem_var_op()(gpu_ctx, d_deeq);
-    delmem_var_op()(gpu_ctx, d_ekb);
-    delmem_var_op()(gpu_ctx, d_qq_nt);
+    delmem_var_op()(d_wg);
+    delmem_var_op()(d_res);
+    delmem_var_op()(d_deeq);
+    delmem_var_op()(d_ekb);
+    delmem_var_op()(d_qq_nt);
 
-    delmem_int_op()(gpu_ctx, d_atom_nh);
-    delmem_int_op()(gpu_ctx, d_atom_na);
+    delmem_int_op()(d_atom_nh);
+    delmem_int_op()(d_atom_na);
 
-    delmem_complex_op()(gpu_ctx, d_becp);
-    delmem_complex_op()(gpu_ctx, d_dbecp);
+    delmem_complex_op()(d_becp);
+    delmem_complex_op()(d_dbecp);
 }
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/meta_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/meta_op_test.cpp
index d9e9244004..85caa61f4b 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/meta_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/meta_op_test.cpp
@@ -60,24 +60,24 @@ TEST_F(TestModuleHamiltMeta, meta_pw_op_gpu)
     std::vector<std::complex<double>> res(expected_out.size(), std::complex<double> {0, 0});
     double * d_gcar = nullptr, * d_kvec_c = nullptr;
     std::complex<double>* d_in = nullptr, * d_res = nullptr;
-    resmem_var_op()(gpu_ctx, d_gcar, gcar.size());
-    resmem_var_op()(gpu_ctx, d_kvec_c, kvec_c.size());
-    resmem_complex_op()(gpu_ctx, d_in, in.size());
-    resmem_complex_op()(gpu_ctx, d_res, res.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_gcar, gcar.data(), gcar.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_kvec_c, kvec_c.data(), kvec_c.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_in, in.data(), in.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size());
+    resmem_var_op()(d_gcar, gcar.size());
+    resmem_var_op()(d_kvec_c, kvec_c.size());
+    resmem_complex_op()(d_in, in.size());
+    resmem_complex_op()(d_res, res.size());
+    syncmem_var_h2d_op()(d_gcar, gcar.data(), gcar.size());
+    syncmem_var_h2d_op()(d_kvec_c, kvec_c.data(), kvec_c.size());
+    syncmem_complex_h2d_op()(d_in, in.data(), in.size());
+    syncmem_complex_h2d_op()(d_res, res.data(), res.size());
 
     meta_gpu_op()(gpu_ctx, ik, pol, npw, npwx, tpiba, d_gcar, d_kvec_c, d_in, d_res);
 
-    syncmem_complex_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size());
+    syncmem_complex_d2h_op()(res.data(), d_res, res.size());
     for (int ii = 0; ii < res.size(); ii++) {
         EXPECT_LT(fabs(res[ii] - expected_out[ii]), 6e-5);
     }
-    delmem_var_op()(gpu_ctx, d_gcar);
-    delmem_var_op()(gpu_ctx, d_kvec_c);
-    delmem_complex_op()(gpu_ctx, d_in);
-    delmem_complex_op()(gpu_ctx, d_res);
+    delmem_var_op()(d_gcar);
+    delmem_var_op()(d_kvec_c);
+    delmem_complex_op()(d_in);
+    delmem_complex_op()(d_res);
 }
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/nonlocal_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/nonlocal_op_test.cpp
index 8591182d4b..47deaec255 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/nonlocal_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/nonlocal_op_test.cpp
@@ -127,12 +127,12 @@ TEST_F(TestModuleHamiltNonlocal, nonlocal_pw_op_gpu)
   double* deeq_dev = NULL;
   std::complex<double>* ps_dev = NULL, * becp_dev = NULL;
   std::vector<std::complex<double>> ps(expected_ps.size(), std::complex<double>(0.0, 0.0));
-  resize_memory_double_op()(gpu_ctx, deeq_dev, deeq.size());
-  resize_memory_complex_double_op()(gpu_ctx, ps_dev, ps.size());
-  resize_memory_complex_double_op()(gpu_ctx, becp_dev, becp.size());
-  syncmem_d_h2d_op()(gpu_ctx, cpu_ctx, deeq_dev, deeq.data(), deeq.size());
-  syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, ps_dev, ps.data(), ps.size());
-  syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, becp_dev, becp.data(), becp.size());
+  resize_memory_double_op()(deeq_dev, deeq.size());
+  resize_memory_complex_double_op()(ps_dev, ps.size());
+  resize_memory_complex_double_op()(becp_dev, becp.size());
+  syncmem_d_h2d_op()(deeq_dev, deeq.data(), deeq.size());
+  syncmem_cd_h2d_op()(ps_dev, ps.data(), ps.size());
+  syncmem_cd_h2d_op()(becp_dev, becp.data(), becp.size());
   nonlocal_gpu_op()(
       gpu_ctx, 
       l1, l2, l3, 
@@ -141,15 +141,15 @@ TEST_F(TestModuleHamiltNonlocal, nonlocal_pw_op_gpu)
       deeq_dev,
       ps_dev, becp_dev);
 
-  syncmem_cd_d2h_op()(cpu_ctx, gpu_ctx, ps.data(), ps_dev, ps.size());
+  syncmem_cd_d2h_op()(ps.data(), ps_dev, ps.size());
   for (int ii = 0; ii < ps.size(); ii++) {
     EXPECT_LT(fabs(ps[ii] - expected_ps[ii]), 5 * 1e-6);
   }
   EXPECT_EQ(sum, expected_sum);
   EXPECT_EQ(iat, expected_iat);
-  delete_memory_double_op()(gpu_ctx, deeq_dev);
-  delete_memory_complex_double_op()(gpu_ctx, ps_dev);
-  delete_memory_complex_double_op()(gpu_ctx, becp_dev);
+  delete_memory_double_op()(deeq_dev);
+  delete_memory_complex_double_op()(ps_dev);
+  delete_memory_complex_double_op()(becp_dev);
 }
 
 TEST_F(TestModuleHamiltNonlocal, nonlocal_pw_spin_op_gpu)
@@ -157,12 +157,12 @@ TEST_F(TestModuleHamiltNonlocal, nonlocal_pw_spin_op_gpu)
   sum = 0; iat = 0;
   std::complex<double>* ps_dev = NULL, * becp_dev = NULL, * deeq_dev = NULL;
   std::vector<std::complex<double>> ps(expected_ps.size(), std::complex<double>(0.0, 0.0));
-  resize_memory_complex_double_op()(gpu_ctx, deeq_dev, deeq_spin.size());
-  resize_memory_complex_double_op()(gpu_ctx, ps_dev, ps.size());
-  resize_memory_complex_double_op()(gpu_ctx, becp_dev, becp_spin.size());
-  syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, deeq_dev, deeq_spin.data(), deeq_spin.size());
-  syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, ps_dev, ps.data(), ps.size());
-  syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, becp_dev, becp_spin.data(), becp_spin.size());
+  resize_memory_complex_double_op()(deeq_dev, deeq_spin.size());
+  resize_memory_complex_double_op()(ps_dev, ps.size());
+  resize_memory_complex_double_op()(becp_dev, becp_spin.size());
+  syncmem_cd_h2d_op()(deeq_dev, deeq_spin.data(), deeq_spin.size());
+  syncmem_cd_h2d_op()(ps_dev, ps.data(), ps.size());
+  syncmem_cd_h2d_op()(becp_dev, becp_spin.data(), becp_spin.size());
   nonlocal_gpu_op()(
       gpu_ctx, 
       l1, l2_spin, l3, 
@@ -171,14 +171,14 @@ TEST_F(TestModuleHamiltNonlocal, nonlocal_pw_spin_op_gpu)
       deeq_dev,
       ps_dev, becp_dev);
 
-  syncmem_cd_d2h_op()(cpu_ctx, gpu_ctx, ps.data(), ps_dev, ps.size());
+  syncmem_cd_d2h_op()(ps.data(), ps_dev, ps.size());
   for (int ii = 0; ii < ps.size(); ii++) {
     EXPECT_LT(fabs(ps[ii] - expected_ps_spin[ii]), 5 * 1e-6);
   }
   EXPECT_EQ(sum, expected_sum);
   EXPECT_EQ(iat, expected_iat);
-  delete_memory_complex_double_op()(gpu_ctx, deeq_dev);
-  delete_memory_complex_double_op()(gpu_ctx, ps_dev);
-  delete_memory_complex_double_op()(gpu_ctx, becp_dev);
+  delete_memory_complex_double_op()(deeq_dev);
+  delete_memory_complex_double_op()(ps_dev);
+  delete_memory_complex_double_op()(becp_dev);
 }
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp
index cbf434da0c..a3be95fce8 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp
@@ -137,24 +137,24 @@ TEST(TestSrcPWStressMultiDevice, cal_dbecp_noevc_nl_op_gpu)
     std::complex<double> * d_vkb0i = nullptr, * d_vkb0j = nullptr, * d_vkb = nullptr,
                 * d_vkb1 = nullptr, * d_vkb2 = nullptr, * d_dbecp_noevc = nullptr;
     double * d_gcar = nullptr, * d_kvec_c = nullptr;
-    resmem_zd_op()(gpu_ctx, d_vkb0i, vkb0i.size());
-    resmem_zd_op()(gpu_ctx, d_vkb0j, vkb0j.size());
-    resmem_zd_op()(gpu_ctx, d_vkb, vkb.size());
-    resmem_zd_op()(gpu_ctx, d_vkb1, vkb1.size());
-    resmem_zd_op()(gpu_ctx, d_vkb2, vkb2.size());
-    resmem_zd_op()(gpu_ctx, d_dbecp_noevc, dbecp_noevc.size());
-    syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_vkb0i, vkb0i.data(), vkb0i.size());
-    syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_vkb0j, vkb0j.data(), vkb0j.size());
-    syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_vkb, vkb.data(), vkb.size());
-    syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_vkb1, vkb1.data(), vkb1.size());
-    syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_vkb2, vkb2.data(), vkb2.size());
-    syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_dbecp_noevc, dbecp_noevc.data(), dbecp_noevc.size());
-
-    resmem_dd_op()(gpu_ctx, d_gcar, gcar.size());
-    resmem_dd_op()(gpu_ctx, d_kvec_c, kvec_c.size());
-
-    syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_gcar, gcar.data(), gcar.size());
-    syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_kvec_c, kvec_c.data(), kvec_c.size());
+    resmem_zd_op()(d_vkb0i, vkb0i.size());
+    resmem_zd_op()(d_vkb0j, vkb0j.size());
+    resmem_zd_op()(d_vkb, vkb.size());
+    resmem_zd_op()(d_vkb1, vkb1.size());
+    resmem_zd_op()(d_vkb2, vkb2.size());
+    resmem_zd_op()(d_dbecp_noevc, dbecp_noevc.size());
+    syncmem_z2z_h2d_op()(d_vkb0i, vkb0i.data(), vkb0i.size());
+    syncmem_z2z_h2d_op()(d_vkb0j, vkb0j.data(), vkb0j.size());
+    syncmem_z2z_h2d_op()(d_vkb, vkb.data(), vkb.size());
+    syncmem_z2z_h2d_op()(d_vkb1, vkb1.data(), vkb1.size());
+    syncmem_z2z_h2d_op()(d_vkb2, vkb2.data(), vkb2.size());
+    syncmem_z2z_h2d_op()(d_dbecp_noevc, dbecp_noevc.data(), dbecp_noevc.size());
+
+    resmem_dd_op()(d_gcar, gcar.size());
+    resmem_dd_op()(d_kvec_c, kvec_c.size());
+
+    syncmem_d2d_h2d_op()(d_gcar, gcar.data(), gcar.size());
+    syncmem_d2d_h2d_op()(d_kvec_c, kvec_c.data(), kvec_c.size());
 
     hamilt::cal_dbecp_noevc_nl_op<double, base_device::DEVICE_GPU>()(gpu_ctx,
                                                                      ipol,
@@ -173,21 +173,21 @@ TEST(TestSrcPWStressMultiDevice, cal_dbecp_noevc_nl_op_gpu)
                                                                      d_vkb2,
                                                                      d_dbecp_noevc);
 
-    syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, dbecp_noevc.data(), d_dbecp_noevc, dbecp_noevc.size());
+    syncmem_z2z_d2h_op()(dbecp_noevc.data(), d_dbecp_noevc, dbecp_noevc.size());
 
     for (int ii = 0; ii < dbecp_noevc.size(); ii++) {
         EXPECT_LT(fabs(dbecp_noevc[ii] - expected_dbecpnoevc[ii]), 6e-5);
     }
 
-    delmem_zd_op()(gpu_ctx, d_vkb0i);
-    delmem_zd_op()(gpu_ctx, d_vkb0j);
-    delmem_zd_op()(gpu_ctx, d_vkb);
-    delmem_zd_op()(gpu_ctx, d_vkb1);
-    delmem_zd_op()(gpu_ctx, d_vkb2);
-    delmem_zd_op()(gpu_ctx, d_dbecp_noevc);
+    delmem_zd_op()(d_vkb0i);
+    delmem_zd_op()(d_vkb0j);
+    delmem_zd_op()(d_vkb);
+    delmem_zd_op()(d_vkb1);
+    delmem_zd_op()(d_vkb2);
+    delmem_zd_op()(d_dbecp_noevc);
 
-    delmem_dd_op()(gpu_ctx, d_gcar);
-    delmem_dd_op()(gpu_ctx, d_kvec_c);
+    delmem_dd_op()(d_gcar);
+    delmem_dd_op()(d_kvec_c);
 }
 
 TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_gpu)
@@ -236,31 +236,31 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_gpu)
     double * d_wg = nullptr, * d_deeq = nullptr, * d_stress = nullptr;
     double * d_ekb = nullptr, * d_qq_nt = nullptr;
     int * d_atom_nh = nullptr, * d_atom_na = nullptr;
-    resmem_zd_op()(gpu_ctx, d_becp, becp.size());
-    resmem_zd_op()(gpu_ctx, d_dbecp, dbecp.size());
-    syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_becp, becp.data(), becp.size());
-    syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_dbecp, dbecp.data(), dbecp.size());
-
-    resmem_dd_op()(gpu_ctx, d_wg, wg.size());
-    resmem_dd_op()(gpu_ctx, d_deeq, deeq.size());
-    resmem_dd_op()(gpu_ctx, d_stress, stress.size());
-    resmem_dd_op()(gpu_ctx, d_ekb, ekb.size());
-    resmem_dd_op()(gpu_ctx, d_qq_nt, qq_nt.size());
-    syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_wg, wg.data(), wg.size());
-    syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_deeq, deeq.data(), deeq.size());
-    syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_stress, stress.data(), stress.size());
-    syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_ekb, ekb.data(), ekb.size());
-    syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_qq_nt, qq_nt.data(), qq_nt.size());
+    resmem_zd_op()(d_becp, becp.size());
+    resmem_zd_op()(d_dbecp, dbecp.size());
+    syncmem_z2z_h2d_op()(d_becp, becp.data(), becp.size());
+    syncmem_z2z_h2d_op()(d_dbecp, dbecp.data(), dbecp.size());
+
+    resmem_dd_op()(d_wg, wg.size());
+    resmem_dd_op()(d_deeq, deeq.size());
+    resmem_dd_op()(d_stress, stress.size());
+    resmem_dd_op()(d_ekb, ekb.size());
+    resmem_dd_op()(d_qq_nt, qq_nt.size());
+    syncmem_d2d_h2d_op()(d_wg, wg.data(), wg.size());
+    syncmem_d2d_h2d_op()(d_deeq, deeq.data(), deeq.size());
+    syncmem_d2d_h2d_op()(d_stress, stress.data(), stress.size());
+    syncmem_d2d_h2d_op()(d_ekb, ekb.data(), ekb.size());
+    syncmem_d2d_h2d_op()(d_qq_nt, qq_nt.data(), qq_nt.size());
 
     using delmem_int_op = base_device::memory::delete_memory_op<int, base_device::DEVICE_GPU>;
     using resmem_int_op = base_device::memory::resize_memory_op<int, base_device::DEVICE_GPU>;
     using syncmem_int_h2d_op
         = base_device::memory::synchronize_memory_op<int, base_device::DEVICE_GPU, base_device::DEVICE_CPU>;
 
-    resmem_int_op()(gpu_ctx, d_atom_nh, atom_nh.size());
-    resmem_int_op()(gpu_ctx, d_atom_na, atom_na.size());
-    syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_nh, atom_nh.data(), atom_nh.size());
-    syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_na, atom_na.data(), atom_na.size());
+    resmem_int_op()(d_atom_nh, atom_nh.size());
+    resmem_int_op()(d_atom_na, atom_na.size());
+    syncmem_int_h2d_op()(d_atom_nh, atom_nh.data(), atom_nh.size());
+    syncmem_int_h2d_op()(d_atom_na, atom_na.data(), atom_na.size());
 
     hamilt::cal_stress_nl_op<double, base_device::DEVICE_GPU>()(gpu_ctx,
                                                                 multi_proj,
@@ -284,22 +284,22 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_gpu)
                                                                 d_dbecp,
                                                                 d_stress);
 
-    syncmem_d2d_d2h_op()(cpu_ctx, gpu_ctx, stress.data(), d_stress, stress.size());
+    syncmem_d2d_d2h_op()(stress.data(), d_stress, stress.size());
 
     for (int ii = 0; ii < stress.size(); ii++) {
         EXPECT_LT(fabs(stress[ii] - expected_stress[ii]), 6e-5);
     }
 
-    delmem_zd_op()(gpu_ctx, d_becp);
-    delmem_zd_op()(gpu_ctx, d_dbecp);
+    delmem_zd_op()(d_becp);
+    delmem_zd_op()(d_dbecp);
 
-    delmem_dd_op()(gpu_ctx, d_wg);
-    delmem_dd_op()(gpu_ctx, d_deeq);
-    delmem_dd_op()(gpu_ctx, d_stress);
-    delmem_dd_op()(gpu_ctx, d_ekb);
-    delmem_dd_op()(gpu_ctx, d_qq_nt);
+    delmem_dd_op()(d_wg);
+    delmem_dd_op()(d_deeq);
+    delmem_dd_op()(d_stress);
+    delmem_dd_op()(d_ekb);
+    delmem_dd_op()(d_qq_nt);
 
-    delmem_int_op()(gpu_ctx, d_atom_nh);
-    delmem_int_op()(gpu_ctx, d_atom_na);
+    delmem_int_op()(d_atom_nh);
+    delmem_int_op()(d_atom_na);
 }
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/veff_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/veff_op_test.cpp
index 318646f063..56c96157fd 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/veff_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/veff_op_test.cpp
@@ -89,19 +89,19 @@ TEST_F(TestModuleHamiltVeff, veff_pw_op_gpu)
     std::vector<std::complex<double>> res = out;
     double* d_in = NULL;
     std::complex<double>* d_res = NULL;
-    resize_memory_double_op()(gpu_ctx, d_in, in.size());
-    resize_memory_complex_op()(gpu_ctx, d_res, res.size());
-    syncmem_double_h2d_op()(gpu_ctx, cpu_ctx, d_in, in.data(), in.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size());
+    resize_memory_double_op()(d_in, in.size());
+    resize_memory_complex_op()(d_res, res.size());
+    syncmem_double_h2d_op()(d_in, in.data(), in.size());
+    syncmem_complex_h2d_op()(d_res, res.data(), res.size());
 
     veff_gpu_op()(gpu_ctx, this->size, d_res, d_in);
 
-    syncmem_complex_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size());
+    syncmem_complex_d2h_op()(res.data(), d_res, res.size());
     for (int ii = 0; ii < res.size(); ii++) {
         EXPECT_LT(fabs(res[ii] - expected_out[ii]), 6e-5);
     }
-    delete_memory_double_op()(gpu_ctx, d_in);
-    delete_memory_complex_op()(gpu_ctx, d_res);
+    delete_memory_double_op()(d_in);
+    delete_memory_complex_op()(d_res);
 }
 
 TEST_F(TestModuleHamiltVeff, veff_pw_spin_op_gpu)
@@ -112,12 +112,12 @@ TEST_F(TestModuleHamiltVeff, veff_pw_spin_op_gpu)
     std::vector<std::complex<double>> res1 = out1_spin;
     double* d_in = NULL;
     std::complex<double>* d_res = NULL, * d_res1 = NULL;
-    resize_memory_double_op()(gpu_ctx, d_in, in_spin.size());
-    resize_memory_complex_op()(gpu_ctx, d_res, res.size());
-    resize_memory_complex_op()(gpu_ctx, d_res1, res1.size());
-    syncmem_double_h2d_op()(gpu_ctx, cpu_ctx, d_in, in_spin.data(), in_spin.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res1, res1.data(), res1.size());
+    resize_memory_double_op()(d_in, in_spin.size());
+    resize_memory_complex_op()(d_res, res.size());
+    resize_memory_complex_op()(d_res1, res1.size());
+    syncmem_double_h2d_op()(d_in, in_spin.data(), in_spin.size());
+    syncmem_complex_h2d_op()(d_res, res.data(), res.size());
+    syncmem_complex_h2d_op()(d_res1, res1.data(), res1.size());
 
     const double * in_[4];
     for (int ii = 0; ii < 4; ii++) {
@@ -126,14 +126,14 @@ TEST_F(TestModuleHamiltVeff, veff_pw_spin_op_gpu)
 
     veff_gpu_op()(gpu_ctx, this->size, d_res, d_res1, in_);
 
-    syncmem_complex_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size());
-    syncmem_complex_d2h_op()(cpu_ctx, gpu_ctx, res1.data(), d_res1, res1.size());
+    syncmem_complex_d2h_op()(res.data(), d_res, res.size());
+    syncmem_complex_d2h_op()(res1.data(), d_res1, res1.size());
     for (int ii = 0; ii < res.size(); ii++) {
         EXPECT_LT(fabs(res[ii] - expected_out_spin[ii]), 7.5e-5);
         EXPECT_LT(fabs(res1[ii] - expected_out1_spin[ii]), 6e-5);
     }
-    delete_memory_double_op()(gpu_ctx, d_in);
-    delete_memory_complex_op()(gpu_ctx, d_res);
-    delete_memory_complex_op()(gpu_ctx, d_res1);
+    delete_memory_double_op()(d_in);
+    delete_memory_complex_op()(d_res);
+    delete_memory_complex_op()(d_res1);
 }
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/vnl_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/vnl_op_test.cpp
index 428304c52d..be5e6a8a68 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/vnl_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/vnl_op_test.cpp
@@ -4049,34 +4049,34 @@ TEST_F(TestSrcPWVnlMultiDevice, cal_vnl_op_gpu)
            *d_tab = nullptr, *d_vkb1 = nullptr;
     std::complex<double>*d_sk = nullptr, *d_vkb = nullptr;
 
-    resmem_int_op()(gpu_ctx, d_atom_na, atom_na.size());
-    resmem_int_op()(gpu_ctx, d_atom_nb, atom_nb.size());
-    resmem_int_op()(gpu_ctx, d_atom_nh, atom_nh.size());
-    syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_na, atom_na.data(), atom_na.size());
-    syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_nb, atom_nb.data(), atom_nb.size());
-    syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_nh, atom_nh.data(), atom_nh.size());
+    resmem_int_op()(d_atom_na, atom_na.size());
+    resmem_int_op()(d_atom_nb, atom_nb.size());
+    resmem_int_op()(d_atom_nh, atom_nh.size());
+    syncmem_int_h2d_op()(d_atom_na, atom_na.data(), atom_na.size());
+    syncmem_int_h2d_op()(d_atom_nb, atom_nb.data(), atom_nb.size());
+    syncmem_int_h2d_op()(d_atom_nh, atom_nh.data(), atom_nh.size());
 
-    resmem_var_op()(gpu_ctx, d_gk, gk.size());
-    resmem_var_op()(gpu_ctx, d_ylm, ylm.size());
-    resmem_var_op()(gpu_ctx, d_indv, indv.size());
-    resmem_var_op()(gpu_ctx, d_nhtol, nhtol.size());
-    resmem_var_op()(gpu_ctx, d_nhtolm, nhtolm.size());
-    resmem_var_op()(gpu_ctx, d_tab, tab.size());
-    resmem_var_op()(gpu_ctx, d_vkb1, vkb1.size());
+    resmem_var_op()(d_gk, gk.size());
+    resmem_var_op()(d_ylm, ylm.size());
+    resmem_var_op()(d_indv, indv.size());
+    resmem_var_op()(d_nhtol, nhtol.size());
+    resmem_var_op()(d_nhtolm, nhtolm.size());
+    resmem_var_op()(d_tab, tab.size());
+    resmem_var_op()(d_vkb1, vkb1.size());
 
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_gk, gk.data(), gk.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_ylm, ylm.data(), ylm.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_indv, indv.data(), indv.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_nhtol, nhtol.data(), nhtol.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_nhtolm, nhtolm.data(), nhtolm.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_tab, tab.data(), tab.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_vkb1, vkb1.data(), vkb1.size());
+    syncmem_var_h2d_op()(d_gk, gk.data(), gk.size());
+    syncmem_var_h2d_op()(d_ylm, ylm.data(), ylm.size());
+    syncmem_var_h2d_op()(d_indv, indv.data(), indv.size());
+    syncmem_var_h2d_op()(d_nhtol, nhtol.data(), nhtol.size());
+    syncmem_var_h2d_op()(d_nhtolm, nhtolm.data(), nhtolm.size());
+    syncmem_var_h2d_op()(d_tab, tab.data(), tab.size());
+    syncmem_var_h2d_op()(d_vkb1, vkb1.data(), vkb1.size());
 
-    resmem_complex_op()(gpu_ctx, d_sk, sk.size());
-    resmem_complex_op()(gpu_ctx, d_vkb, vkb.size());
+    resmem_complex_op()(d_sk, sk.size());
+    resmem_complex_op()(d_vkb, vkb.size());
 
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_sk, sk.data(), sk.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_vkb, vkb.data(), vkb.size());
+    syncmem_complex_h2d_op()(d_sk, sk.data(), sk.size());
+    syncmem_complex_h2d_op()(d_vkb, vkb.data(), vkb.size());
 
     hamilt::cal_vnl_op<double, base_device::DEVICE_GPU>()(gpu_ctx,
                                                           ntype,
@@ -4101,26 +4101,26 @@ TEST_F(TestSrcPWVnlMultiDevice, cal_vnl_op_gpu)
                                                           d_sk,
                                                           d_vkb);
 
-    syncmem_complex_d2h_op()(cpu_ctx, gpu_ctx, vkb.data(), d_vkb, vkb.size());
+    syncmem_complex_d2h_op()(vkb.data(), d_vkb, vkb.size());
 
     for (int ii = 0; ii < vkb.size(); ii++)
     {
         EXPECT_LT(fabs(vkb[ii] - expected_vkb[ii]), 6e-5);
     }
 
-    delmem_int_op()(gpu_ctx, d_atom_na);
-    delmem_int_op()(gpu_ctx, d_atom_nh);
-    delmem_int_op()(gpu_ctx, d_atom_nb);
+    delmem_int_op()(d_atom_na);
+    delmem_int_op()(d_atom_nh);
+    delmem_int_op()(d_atom_nb);
 
-    delmem_var_op()(gpu_ctx, d_gk);
-    delmem_var_op()(gpu_ctx, d_ylm);
-    delmem_var_op()(gpu_ctx, d_indv);
-    delmem_var_op()(gpu_ctx, d_nhtol);
-    delmem_var_op()(gpu_ctx, d_nhtolm);
-    delmem_var_op()(gpu_ctx, d_tab);
-    delmem_var_op()(gpu_ctx, d_vkb1);
+    delmem_var_op()(d_gk);
+    delmem_var_op()(d_ylm);
+    delmem_var_op()(d_indv);
+    delmem_var_op()(d_nhtol);
+    delmem_var_op()(d_nhtolm);
+    delmem_var_op()(d_tab);
+    delmem_var_op()(d_vkb1);
 
-    delmem_complex_op()(gpu_ctx, d_sk);
-    delmem_complex_op()(gpu_ctx, d_vkb);
+    delmem_complex_op()(d_sk);
+    delmem_complex_op()(d_vkb);
 }
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/wf_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/wf_op_test.cpp
index 8b46679d67..2463234c31 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/wf_op_test.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/wf_op_test.cpp
@@ -412,26 +412,26 @@ TEST_F(TestSrcPWWfMultiDevice, cal_sk_op_gpu)
     double * d_kvec_c = nullptr, * d_atom_tau = nullptr;
     std::complex<double> * d_sk = nullptr, * d_eigts1 = nullptr, * d_eigts2 = nullptr, * d_eigts3 = nullptr;
 
-    resmem_int_op()(gpu_ctx, d_atom_na, atom_na.size());
-    resmem_int_op()(gpu_ctx, d_igl2isz, igl2isz.size());
-    resmem_int_op()(gpu_ctx, d_is2fftixy, is2fftixy.size());
-    syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_na, atom_na.data(), atom_na.size());
-    syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_igl2isz, igl2isz.data(), igl2isz.size());
-    syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_is2fftixy, is2fftixy.data(), is2fftixy.size());
+    resmem_int_op()(d_atom_na, atom_na.size());
+    resmem_int_op()(d_igl2isz, igl2isz.size());
+    resmem_int_op()(d_is2fftixy, is2fftixy.size());
+    syncmem_int_h2d_op()(d_atom_na, atom_na.data(), atom_na.size());
+    syncmem_int_h2d_op()(d_igl2isz, igl2isz.data(), igl2isz.size());
+    syncmem_int_h2d_op()(d_is2fftixy, is2fftixy.data(), is2fftixy.size());
 
-    resmem_var_op()(gpu_ctx, d_kvec_c, kvec_c.size());
-    resmem_var_op()(gpu_ctx, d_atom_tau, atom_tau.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_kvec_c, kvec_c.data(), kvec_c.size());
-    syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_atom_tau, atom_tau.data(), atom_tau.size());
+    resmem_var_op()(d_kvec_c, kvec_c.size());
+    resmem_var_op()(d_atom_tau, atom_tau.size());
+    syncmem_var_h2d_op()(d_kvec_c, kvec_c.data(), kvec_c.size());
+    syncmem_var_h2d_op()(d_atom_tau, atom_tau.data(), atom_tau.size());
 
-    resmem_complex_op()(gpu_ctx, d_sk, sk.size());
-    resmem_complex_op()(gpu_ctx, d_eigts1, eigts1.size());
-    resmem_complex_op()(gpu_ctx, d_eigts2, eigts2.size());
-    resmem_complex_op()(gpu_ctx, d_eigts3, eigts3.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_sk, sk.data(), sk.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_eigts1, eigts1.data(), eigts1.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_eigts2, eigts2.data(), eigts2.size());
-    syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_eigts3, eigts3.data(), eigts3.size());
+    resmem_complex_op()(d_sk, sk.size());
+    resmem_complex_op()(d_eigts1, eigts1.size());
+    resmem_complex_op()(d_eigts2, eigts2.size());
+    resmem_complex_op()(d_eigts3, eigts3.size());
+    syncmem_complex_h2d_op()(d_sk, sk.data(), sk.size());
+    syncmem_complex_h2d_op()(d_eigts1, eigts1.data(), eigts1.size());
+    syncmem_complex_h2d_op()(d_eigts2, eigts2.data(), eigts2.size());
+    syncmem_complex_h2d_op()(d_eigts3, eigts3.data(), eigts3.size());
 
     hamilt::cal_sk_op<double, base_device::DEVICE_GPU>()(gpu_ctx,
                                                          ik,
@@ -459,22 +459,22 @@ TEST_F(TestSrcPWWfMultiDevice, cal_sk_op_gpu)
                                                          d_eigts3,
                                                          d_sk);
 
-    syncmem_complex_d2h_op()(cpu_ctx, gpu_ctx, sk.data(), d_sk, sk.size());
+    syncmem_complex_d2h_op()(sk.data(), d_sk, sk.size());
 
     for (int ii = 0; ii < sk.size(); ii++) {
         EXPECT_LT(fabs(sk[ii] - expected_sk[ii]), 6e-5);
     }
 
-    delmem_int_op()(gpu_ctx, d_atom_na);
-    delmem_int_op()(gpu_ctx, d_igl2isz);
-    delmem_int_op()(gpu_ctx, d_is2fftixy);
+    delmem_int_op()(d_atom_na);
+    delmem_int_op()(d_igl2isz);
+    delmem_int_op()(d_is2fftixy);
 
-    delmem_var_op()(gpu_ctx, d_kvec_c);
-    delmem_var_op()(gpu_ctx, d_atom_tau);
+    delmem_var_op()(d_kvec_c);
+    delmem_var_op()(d_atom_tau);
 
-    delmem_complex_op()(gpu_ctx, d_sk);
-    delmem_complex_op()(gpu_ctx, d_eigts1);
-    delmem_complex_op()(gpu_ctx, d_eigts2);
-    delmem_complex_op()(gpu_ctx, d_eigts3);
+    delmem_complex_op()(d_sk);
+    delmem_complex_op()(d_eigts1);
+    delmem_complex_op()(d_eigts2);
+    delmem_complex_op()(d_eigts3);
 }
 #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp b/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp
index aa28b5abe2..79649fab07 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp
@@ -164,7 +164,7 @@ void Nonlocal_maths<FPTYPE, Device>::cal_ylm(int lmax, int npw, const FPTYPE* q,
         // calculate
         ModuleBase::YlmReal::Ylm_Real(cpu_ctx, ntot_ylm, npw, q, ylm_cpu.data());
         // send from cpu to gpu
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, ylm, ylm_cpu.data(), ylm_cpu.size());
+        syncmem_var_h2d_op()(ylm, ylm_cpu.data(), ylm_cpu.size());
     }
     else
     {
@@ -193,7 +193,7 @@ void Nonlocal_maths<FPTYPE, Device>::cal_ylm_deri(int lmax, int npw, const FPTYP
             Nonlocal_maths<FPTYPE, Device>::dylmr2(ntot_ylm, npw, q, &dylmdq_cpu[ipol * ntot_ylm * npw], ipol);
         }
         // send from cpu to gpu
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, out, dylmdq_cpu.data(), dylmdq_cpu.size());
+        syncmem_var_h2d_op()(out, dylmdq_cpu.data(), dylmdq_cpu.size());
     }
     else
     {
diff --git a/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.cpp
index d4b7e51b65..e15793cbdc 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.cpp
@@ -192,29 +192,29 @@ void Onsite_Proj_tools<FPTYPE, Device>::allocate_memory(const ModuleBase::matrix
     // allocate the memory for vkb and vkb_deri.
     if (this->device == base_device::GpuDevice)
     {
-        resmem_int_op()(this->ctx, this->d_dvkb_indexes, max_nh * 4);
+        resmem_int_op()(this->d_dvkb_indexes, max_nh * 4);
     }
 
-    resmem_var_op()(this->ctx, this->hd_vq, nprojmax * max_npw);
-    resmem_var_op()(this->ctx, this->hd_vq_deri, nprojmax * max_npw);
-    resmem_var_op()(this->ctx, this->hd_ylm, (lprojmax + 1) * (lprojmax + 1) * max_npw);
-    resmem_var_op()(this->ctx, this->hd_ylm_deri, 3 * (lprojmax + 1) * (lprojmax + 1) * max_npw);
+    resmem_var_op()(this->hd_vq, nprojmax * max_npw);
+    resmem_var_op()(this->hd_vq_deri, nprojmax * max_npw);
+    resmem_var_op()(this->hd_ylm, (lprojmax + 1) * (lprojmax + 1) * max_npw);
+    resmem_var_op()(this->hd_ylm_deri, 3 * (lprojmax + 1) * (lprojmax + 1) * max_npw);
 
     if (this->device == base_device::GpuDevice)
     {
-        resmem_var_op()(this->ctx, d_wg, wg.nr * wg.nc);
-        resmem_var_op()(this->ctx, d_ekb, ekb.nr * ekb.nc);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, wg.c, wg.nr * wg.nc);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_ekb, ekb.c, ekb.nr * ekb.nc);
-        resmem_int_op()(this->ctx, atom_nh, this->ntype);
-        resmem_int_op()(this->ctx, atom_na, this->ntype);
-        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, atom_nh, h_atom_nh.data(), this->ntype);
-        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, atom_na, h_atom_na.data(), this->ntype);
+        resmem_var_op()(d_wg, wg.nr * wg.nc);
+        resmem_var_op()(d_ekb, ekb.nr * ekb.nc);
+        syncmem_var_h2d_op()(d_wg, wg.c, wg.nr * wg.nc);
+        syncmem_var_h2d_op()(d_ekb, ekb.c, ekb.nr * ekb.nc);
+        resmem_int_op()(atom_nh, this->ntype);
+        resmem_int_op()(atom_na, this->ntype);
+        syncmem_int_h2d_op()(atom_nh, h_atom_nh.data(), this->ntype);
+        syncmem_int_h2d_op()(atom_na, h_atom_na.data(), this->ntype);
 
-        resmem_var_op()(this->ctx, d_g_plus_k, max_npw * 5);
-        resmem_var_op()(this->ctx, d_pref, max_nh);
-        resmem_var_op()(this->ctx, d_vq_tab, this->tabtpr->getSize());
-        resmem_complex_op()(this->ctx, d_pref_in, max_nh);
+        resmem_var_op()(d_g_plus_k, max_npw * 5);
+        resmem_var_op()(d_pref, max_nh);
+        resmem_var_op()(d_vq_tab, this->tabtpr->getSize());
+        resmem_complex_op()(d_pref_in, max_nh);
     }
     else
     {
@@ -230,39 +230,39 @@ void Onsite_Proj_tools<FPTYPE, Device>::delete_memory()
 {
     // delete memory
 
-    delmem_var_op()(this->ctx, hd_vq);
-    delmem_var_op()(this->ctx, hd_vq_deri);
-    delmem_var_op()(this->ctx, hd_ylm);
-    delmem_var_op()(this->ctx, hd_ylm_deri);
+    delmem_var_op()(hd_vq);
+    delmem_var_op()(hd_vq_deri);
+    delmem_var_op()(hd_ylm);
+    delmem_var_op()(hd_ylm_deri);
 
     // delete memory on GPU
     if (this->device == base_device::GpuDevice)
     {
-        delmem_var_op()(this->ctx, d_wg);
-        delmem_var_op()(this->ctx, d_ekb);
-        delmem_int_op()(this->ctx, atom_nh);
-        delmem_int_op()(this->ctx, atom_na);
-        delmem_var_op()(this->ctx, d_g_plus_k);
-        delmem_var_op()(this->ctx, d_pref);
-        delmem_var_op()(this->ctx, d_vq_tab);
-        delmem_complex_op()(this->ctx, this->d_pref_in);
-        delmem_int_op()(this->ctx, d_dvkb_indexes);
+        delmem_var_op()(d_wg);
+        delmem_var_op()(d_ekb);
+        delmem_int_op()(atom_nh);
+        delmem_int_op()(atom_na);
+        delmem_var_op()(d_g_plus_k);
+        delmem_var_op()(d_pref);
+        delmem_var_op()(d_vq_tab);
+        delmem_complex_op()(this->d_pref_in);
+        delmem_int_op()(d_dvkb_indexes);
     }
 
     if (becp != nullptr)
     {
-        delmem_complex_op()(this->ctx, becp);
-        delmem_complex_op()(this->ctx, hd_sk);
+        delmem_complex_op()(becp);
+        delmem_complex_op()(hd_sk);
     }
     if (dbecp != nullptr)
     {
-        delmem_complex_op()(this->ctx, dbecp);
+        delmem_complex_op()(dbecp);
     }
     if (this->pre_ik_f != -1)
     {
-        delmem_int_op()(this->ctx, gcar_zero_indexes);
-        delmem_complex_op()(this->ctx, vkb_save);
-        delmem_var_op()(this->ctx, gcar);
+        delmem_int_op()(gcar_zero_indexes);
+        delmem_complex_op()(vkb_save);
+        delmem_var_op()(gcar);
     }
 }
 
@@ -288,7 +288,7 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_becp(int ik,
     const int npw = this->wfc_basis_->npwk[ik];
     if (becp_in == nullptr && this->becp == nullptr)
     {
-        resmem_complex_op()(this->ctx, becp, this->nbands * npol * this->nkb);
+        resmem_complex_op()(becp, this->nbands * npol * this->nkb);
     }
     std::complex<FPTYPE>* becp_tmp = becp_in == nullptr ? this->becp : becp_in;
     const int size_becp_act = npm * npol * this->nkb;
@@ -297,7 +297,7 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_becp(int ik,
         const int size_becp = this->nbands * npol * this->nkb;
         if (this->becp == nullptr)
         {
-            resmem_complex_op()(this->ctx, becp, size_becp);
+            resmem_complex_op()(becp, size_becp);
         }
 
         // prepare math tools
@@ -311,7 +311,7 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_becp(int ik,
         // vq_tb has dimension (ntype, nproj, GlobalV::NQX)
 
         // calculate sk
-        resmem_complex_op()(ctx, hd_sk, this->ucell_->nat * npw);
+        resmem_complex_op()(hd_sk, this->ucell_->nat * npw);
         this->sf_->get_sk(ctx, ik, this->wfc_basis_, hd_sk);
         std::complex<FPTYPE>* d_sk = this->hd_sk;
         // prepare ylm，size: (lmax+1)^2 * this->max_npw
@@ -347,8 +347,8 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_becp(int ik,
 
         if (this->device == base_device::GpuDevice)
         {
-            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_g_plus_k, g_plus_k.data(), g_plus_k.size());
-            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vq_tab, this->tabtpr->ptr, this->tabtpr->getSize());
+            syncmem_var_h2d_op()(d_g_plus_k, g_plus_k.data(), g_plus_k.size());
+            syncmem_var_h2d_op()(d_vq_tab, this->tabtpr->ptr, this->tabtpr->getSize());
             gk = d_g_plus_k;
             vq_tb = d_vq_tab;
         }
@@ -390,8 +390,8 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_becp(int ik,
 
             if (this->device == base_device::GpuDevice)
             {
-                syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_dvkb_indexes, dvkb_indexes.data(), nh * 4);
-                syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_pref_in, pref.data(), nh);
+                syncmem_int_h2d_op()(d_dvkb_indexes, dvkb_indexes.data(), nh * 4);
+                syncmem_complex_h2d_op()(d_pref_in, pref.data(), nh);
             }
 
             for (int ia = 0; ia < h_atom_na[it]; ia++)
@@ -443,11 +443,11 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_becp(int ik,
     if (this->device == base_device::GpuDevice)
     {
         std::complex<FPTYPE>* h_becp = nullptr;
-        resmem_complex_h_op()(this->cpu_ctx, h_becp, size_becp_act);
-        syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp_tmp, size_becp_act);
+        resmem_complex_h_op()(h_becp, size_becp_act);
+        syncmem_complex_d2h_op()(h_becp, becp_tmp, size_becp_act);
         Parallel_Reduce::reduce_pool(h_becp, size_becp_act);
-        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp_tmp, h_becp, size_becp_act);
-        delmem_complex_h_op()(this->cpu_ctx, h_becp);
+        syncmem_complex_h2d_op()(becp_tmp, h_becp, size_becp_act);
+        delmem_complex_h_op()(h_becp);
     }
     else
     {
@@ -474,7 +474,7 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
     const int npm_npol = npm * npol;
     if (this->dbecp == nullptr)
     {
-        resmem_complex_op()(this->ctx, dbecp, size_becp);
+        resmem_complex_op()(dbecp, size_becp);
     }
 
     // prepare math tools
@@ -540,8 +540,8 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, i
                              this->dvkb_indexes.data());
         if (this->device == base_device::GpuDevice)
         {
-            syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_dvkb_indexes, dvkb_indexes.data(), nh * 4);
-            syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_pref_in, pref.data(), nh);
+            syncmem_int_h2d_op()(d_dvkb_indexes, dvkb_indexes.data(), nh * 4);
+            syncmem_complex_h2d_op()(d_pref_in, pref.data(), nh);
         }
         for (int ia = 0; ia < h_atom_na[it]; ia++)
         {
@@ -613,8 +613,8 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
     // calculate gcarx, gcary/gcarx and gcarz/gcary, overwrite gcar
     if (this->pre_ik_f == -1) // if it is the very first run, we allocate
     {
-        resmem_var_op()(this->ctx, gcar, 3 * this->wfc_basis_->npwk_max);
-        resmem_int_op()(this->ctx, gcar_zero_indexes, 3 * this->wfc_basis_->npwk_max);
+        resmem_var_op()(gcar, 3 * this->wfc_basis_->npwk_max);
+        resmem_int_op()(gcar_zero_indexes, 3 * this->wfc_basis_->npwk_max);
     }
     // first refresh the value of gcar_zero_indexes, gcar_zero_counts
     if (this->pre_ik_f != ik)
@@ -647,7 +647,7 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
     const int size_becp = this->nbands * npol * this->nkb;
     if (this->dbecp == nullptr) // if it is the very first run, we allocate
     {                           // why not judging whether dbecp == nullptr inside resmem_complex_op?
-        resmem_complex_op()(this->ctx, dbecp, 3 * size_becp);
+        resmem_complex_op()(dbecp, 3 * size_becp);
     }
     // do gemm to get dbecp and revert the ppcell_vkb for next ipol
     const std::complex<FPTYPE>* ppsi = &(this->psi_[0](ik, 0, 0));
@@ -799,10 +799,10 @@ void Onsite_Proj_tools<FPTYPE, Device>::transfer_gcar(int npw, int npw_max, cons
     }
     // prepare the memory for vkb_save
     const int max_count = std::max(gcar_zero_counts[0], std::max(gcar_zero_counts[1], gcar_zero_counts[2]));
-    resmem_complex_op()(this->ctx, this->vkb_save, this->nkb * max_count);
+    resmem_complex_op()(this->vkb_save, this->nkb * max_count);
     // transfer the gcar and gcar_zero_indexes to the device
-    syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gcar, gcar_tmp.data(), 3 * npw_max);
-    syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, gcar_zero_indexes, gcar_zero_indexes_tmp.data(), 3 * npw_max);
+    syncmem_var_h2d_op()(gcar, gcar_tmp.data(), 3 * npw_max);
+    syncmem_int_h2d_op()(gcar_zero_indexes, gcar_zero_indexes_tmp.data(), 3 * npw_max);
 }
 
 template <typename FPTYPE, typename Device>
@@ -819,11 +819,11 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_force_dftu(int ik,
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == base_device::GpuDevice)
     {
-        resmem_int_op()(this->ctx, orbital_corr_tmp, this->ucell_->ntype);
-        syncmem_int_h2d_op()(this->ctx, cpu_ctx, orbital_corr_tmp, orbital_corr, this->ucell_->ntype);
-        resmem_complex_op()(this->ctx, vu_tmp, size_vu);
-        syncmem_complex_h2d_op()(this->ctx, cpu_ctx, vu_tmp, vu, size_vu);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, h_wg, this->nbands * (ik+1));
+        resmem_int_op()(orbital_corr_tmp, this->ucell_->ntype);
+        syncmem_int_h2d_op()(orbital_corr_tmp, orbital_corr, this->ucell_->ntype);
+        resmem_complex_op()(vu_tmp, size_vu);
+        syncmem_complex_h2d_op()(vu_tmp, vu, size_vu);
+        syncmem_var_h2d_op()(d_wg, h_wg, this->nbands * (ik+1));
     }
     else
 #endif
@@ -853,8 +853,8 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_force_dftu(int ik,
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == base_device::GpuDevice)
     {
-        delmem_complex_op()(this->ctx, vu_tmp);
-        delmem_int_op()(this->ctx, orbital_corr_tmp);
+        delmem_complex_op()(vu_tmp);
+        delmem_int_op()(orbital_corr_tmp);
     }
 #endif
 }
@@ -877,9 +877,9 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_force_dspin(int ik,
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == base_device::GpuDevice)
     {
-        resmem_var_op()(this->ctx, lambda_tmp, this->ucell_->nat * 3);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, lambda_tmp, lambda_array.data(), this->ucell_->nat * 3);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, h_wg, this->nbands * (ik+1));
+        resmem_var_op()(lambda_tmp, this->ucell_->nat * 3);
+        syncmem_var_h2d_op()(lambda_tmp, lambda_array.data(), this->ucell_->nat * 3);
+        syncmem_var_h2d_op()(d_wg, h_wg, this->nbands * (ik+1));
     }
     else
 #endif
@@ -908,7 +908,7 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_force_dspin(int ik,
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == base_device::GpuDevice)
     {
-        delmem_var_op()(this->ctx, lambda_tmp);
+        delmem_var_op()(lambda_tmp);
     }
 #endif
 }
@@ -927,11 +927,11 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_stress_dftu(int ik,
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == base_device::GpuDevice)
     {
-        resmem_int_op()(this->ctx, orbital_corr_tmp, this->ucell_->ntype);
-        syncmem_int_h2d_op()(this->ctx, cpu_ctx, orbital_corr_tmp, orbital_corr, this->ucell_->ntype);
-        resmem_complex_op()(this->ctx, vu_tmp, size_vu);
-        syncmem_complex_h2d_op()(this->ctx, cpu_ctx, vu_tmp, vu, size_vu);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, h_wg, this->nbands * (ik+1));
+        resmem_int_op()(orbital_corr_tmp, this->ucell_->ntype);
+        syncmem_int_h2d_op()(orbital_corr_tmp, orbital_corr, this->ucell_->ntype);
+        resmem_complex_op()(vu_tmp, size_vu);
+        syncmem_complex_h2d_op()(vu_tmp, vu, size_vu);
+        syncmem_var_h2d_op()(d_wg, h_wg, this->nbands * (ik+1));
     }
     else
 #endif
@@ -957,8 +957,8 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_stress_dftu(int ik,
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == base_device::GpuDevice)
     {
-        delmem_complex_op()(this->ctx, vu_tmp);
-        delmem_int_op()(this->ctx, orbital_corr_tmp);
+        delmem_complex_op()(vu_tmp);
+        delmem_int_op()(orbital_corr_tmp);
     }
 #endif
 }
@@ -981,9 +981,9 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_stress_dspin(int ik,
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == base_device::GpuDevice)
     {
-        resmem_var_op()(this->ctx, lambda_tmp, this->ucell_->nat * 3);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, lambda_tmp, lambda_array.data(), this->ucell_->nat * 3);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, h_wg, this->nbands * (ik+1));
+        resmem_var_op()(lambda_tmp, this->ucell_->nat * 3);
+        syncmem_var_h2d_op()(lambda_tmp, lambda_array.data(), this->ucell_->nat * 3);
+        syncmem_var_h2d_op()(d_wg, h_wg, this->nbands * (ik+1));
     }
     else
 #endif
@@ -1009,7 +1009,7 @@ void Onsite_Proj_tools<FPTYPE, Device>::cal_stress_dspin(int ik,
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == base_device::GpuDevice)
     {
-        delmem_var_op()(this->ctx, lambda_tmp);
+        delmem_var_op()(lambda_tmp);
     }
 #endif
 }
diff --git a/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.cpp b/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.cpp
index 2bb69dc131..f235df15e5 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.cpp
@@ -173,7 +173,7 @@ void projectors::OnsiteProjector<T, Device>::init(const std::string& orbital_dir
             this->tot_nproj = itiaiprojm2irow_.size();
             this->npwx_ = this->pw_basis_->npwk_max;
             this->size_vproj = this->tot_nproj * this->npwx_;
-            resmem_complex_op()(this->ctx, this->tab_atomic_, this->size_vproj, "OnsiteP::tab_atomic_");
+            resmem_complex_op()(this->tab_atomic_, this->size_vproj, "OnsiteP::tab_atomic_");
         }
 
         delete this->fs_tools; // it is okay to delete nullptr
@@ -191,12 +191,12 @@ projectors::OnsiteProjector<T, Device>::~OnsiteProjector()
 {
     //delete[] becp;
     delete fs_tools;
-    delmem_complex_op()(this->ctx, this->tab_atomic_);
+    delmem_complex_op()(this->tab_atomic_);
     if(this->device == base_device::GpuDevice)
     {
-        delmem_complex_h_op()(this->cpu_ctx, this->h_becp);
+        delmem_complex_h_op()(this->h_becp);
     }
-    delmem_complex_op()(this->ctx, this->becp);
+    delmem_complex_op()(this->becp);
 
 }
 
@@ -390,10 +390,10 @@ void projectors::OnsiteProjector<T, Device>::overlap_proj_psi(
     if(this->becp == nullptr || this->size_becp < npm*this->tot_nproj)
     {
         this->size_becp = npm*this->tot_nproj;
-        resmem_complex_op()(this->ctx, this->becp, this->size_becp);
+        resmem_complex_op()(this->becp, this->size_becp);
         if(this->device == base_device::GpuDevice )
         {
-            resmem_complex_h_op()(this->cpu_ctx, this->h_becp, this->size_becp);
+            resmem_complex_h_op()(this->h_becp, this->size_becp);
         }
         else
         {
@@ -403,7 +403,7 @@ void projectors::OnsiteProjector<T, Device>::overlap_proj_psi(
     this->fs_tools->cal_becp(ik_, npm/npol, this->becp, ppsi); // in cal_becp, npm should be the one not multiplied by npol
     if(this->device == base_device::GpuDevice)
     {
-        syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, this->becp, this->size_becp);
+        syncmem_complex_d2h_op()(h_becp, this->becp, this->size_becp);
     }
     ModuleBase::timer::tick("OnsiteProj", "overlap");
 }
diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/meta_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/meta_pw.cpp
index b0372109dc..dc8a566d05 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/meta_pw.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/meta_pw.cpp
@@ -27,14 +27,14 @@ Meta<OperatorPW<T, Device>>::Meta(Real tpiba_in,
     this->vk_row = vk_row;
     this->vk_col = vk_col;
     this->wfcpw = wfcpw_in;
-    resmem_complex_op()(this->ctx, this->porter, this->wfcpw->nmaxgr, "Meta<PW>::porter");
+    resmem_complex_op()(this->porter, this->wfcpw->nmaxgr, "Meta<PW>::porter");
 
 }
 
 template<typename T, typename Device>
 Meta<OperatorPW<T, Device>>::~Meta()
 {
-    delmem_complex_op()(this->ctx, this->porter);
+    delmem_complex_op()(this->porter);
 }
 
 template<typename T, typename Device>
@@ -55,7 +55,7 @@ void Meta<OperatorPW<T, Device>>::act(
     ModuleBase::timer::tick("Operator", "MetaPW");
     if(is_first_node)
     {
-        setmem_complex_op()(this->ctx, tmhpsi, 0, nbasis*nbands/npol);
+        setmem_complex_op()(tmhpsi, 0, nbasis*nbands/npol);
     }
 
     const int current_spin = this->isk[this->ik];
diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/nonlocal_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/nonlocal_pw.cpp
index 563e9d23a0..7446151d36 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/nonlocal_pw.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/nonlocal_pw.cpp
@@ -35,8 +35,8 @@ Nonlocal<OperatorPW<T, Device>>::Nonlocal(const int* isk_in,
 
 template<typename T, typename Device>
 Nonlocal<OperatorPW<T, Device>>::~Nonlocal() {
-    delmem_complex_op()(this->ctx, this->ps);
-    delmem_complex_op()(this->ctx, this->becp);
+    delmem_complex_op()(this->ps);
+    delmem_complex_op()(this->becp);
 }
 
 template<typename T, typename Device>
@@ -72,10 +72,10 @@ void Nonlocal<OperatorPW<T, Device>>::add_nonlocal_pp(T *hpsi_in, const T *becp,
     // T *ps = new T[nkb * m];
     // ModuleBase::GlobalFunc::ZEROS(ps, m * nkb);
     if (this->nkb_m < m * nkb) {
-        resmem_complex_op()(this->ctx, this->ps, nkb * m, "Nonlocal<PW>::ps");
+        resmem_complex_op()(this->ps, nkb * m, "Nonlocal<PW>::ps");
         this->nkb_m = m * nkb;
     }
-    setmem_complex_op()(this->ctx, this->ps, 0, nkb * m);
+    setmem_complex_op()(this->ps, 0, nkb * m);
 
     int sum = 0;
     int iat = 0;
@@ -221,7 +221,7 @@ void Nonlocal<OperatorPW<T, Device>>::act(
     ModuleBase::timer::tick("Operator", "NonlocalPW");
     if(is_first_node)
     {
-        setmem_complex_op()(this->ctx, tmhpsi, 0, nbasis*nbands/npol);
+        setmem_complex_op()(tmhpsi, 0, nbasis*nbands/npol);
     }
     if(!PARAM.inp.use_paw)
     {
@@ -235,7 +235,7 @@ void Nonlocal<OperatorPW<T, Device>>::act(
             // qianrui optimize 2021-3-31
             int nkb = this->ppcell->nkb;
             if (this->nkb_m < nbands * nkb) {
-                resmem_complex_op()(this->ctx, this->becp, nbands * nkb, "Nonlocal<PW>::becp");
+                resmem_complex_op()(this->becp, nbands * nkb, "Nonlocal<PW>::becp");
             }
             // ModuleBase::ComplexMatrix becp(nbands, nkb, false);
             char transa = 'C';
diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.cpp
index 39f0c1458a..3cfd345356 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.cpp
@@ -30,22 +30,22 @@ OnsiteProj<OperatorPW<T, Device>>::OnsiteProj(const int* isk_in,
 
 template<typename T, typename Device>
 OnsiteProj<OperatorPW<T, Device>>::~OnsiteProj() {
-    delmem_complex_op()(this->ctx, this->ps);
+    delmem_complex_op()(this->ps);
     if(this->init_delta_spin)
     {
-        delmem_int_op()(this->ctx, this->ip_iat);
-        delmem_complex_op()(this->ctx, this->lambda_coeff);
+        delmem_int_op()(this->ip_iat);
+        delmem_complex_op()(this->lambda_coeff);
     }
     if(this->has_dftu)
     {
         if(!init_delta_spin)
         {
-            delmem_int_op()(this->ctx, this->ip_iat);
+            delmem_int_op()(this->ip_iat);
         }
-        delmem_int_op()(this->ctx, this->orb_l_iat);
-        delmem_int_op()(this->ctx, this->ip_m);
-        delmem_int_op()(this->ctx, this->vu_begin_iat);
-        delmem_complex_op()(this->ctx, this->vu_device);
+        delmem_int_op()(this->orb_l_iat);
+        delmem_int_op()(this->ip_m);
+        delmem_int_op()(this->vu_begin_iat);
+        delmem_complex_op()(this->vu_device);
     }
 }
 
@@ -127,17 +127,17 @@ void OnsiteProj<OperatorPW<T, Device>>::cal_ps_delta_spin(const int npol, const
     // T *ps = new T[tnp * m];
     // ModuleBase::GlobalFunc::ZEROS(ps, m * tnp);
     if (this->nkb_m < m * tnp) {
-        resmem_complex_op()(this->ctx, this->ps, tnp * m, "OnsiteProj<PW>::ps");
+        resmem_complex_op()(this->ps, tnp * m, "OnsiteProj<PW>::ps");
         this->nkb_m = m * tnp;
     }
-    setmem_complex_op()(this->ctx, this->ps, 0, tnp * m);
+    setmem_complex_op()(this->ps, 0, tnp * m);
 
     if(!this->init_delta_spin)
     {
         this->init_delta_spin = true;
         //prepare ip_iat and lambda_coeff
-        resmem_int_op()(this->ctx, this->ip_iat, onsite_p->get_tot_nproj());
-        resmem_complex_op()(this->ctx, this->lambda_coeff, this->ucell->nat * 4);
+        resmem_int_op()(this->ip_iat, onsite_p->get_tot_nproj());
+        resmem_complex_op()(this->lambda_coeff, this->ucell->nat * 4);
         std::vector<int> ip_iat0(onsite_p->get_tot_nproj());
         int ip0 = 0;
         for(int iat=0;iat<this->ucell->nat;iat++)
@@ -147,7 +147,7 @@ void OnsiteProj<OperatorPW<T, Device>>::cal_ps_delta_spin(const int npol, const
                 ip_iat0[ip0++] = iat;
             }
         }
-        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->ip_iat, ip_iat0.data(), onsite_p->get_tot_nproj());
+        syncmem_int_h2d_op()(this->ip_iat, ip_iat0.data(), onsite_p->get_tot_nproj());
     }
 
     // prepare array of nh_iat and lambda_array to pass to the onsite_ps_op operator
@@ -159,7 +159,7 @@ void OnsiteProj<OperatorPW<T, Device>>::cal_ps_delta_spin(const int npol, const
         tmp_lambda_coeff[iat * 4 + 2] = std::complex<double>(lambda[iat][0], -1 * lambda[iat][1]);
         tmp_lambda_coeff[iat * 4 + 3] = std::complex<double>(-1 * lambda[iat][2], 0.0);
     }
-    syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, this->lambda_coeff, tmp_lambda_coeff.data(), this->ucell->nat * 4);
+    syncmem_complex_h2d_op()(this->lambda_coeff, tmp_lambda_coeff.data(), this->ucell->nat * 4);
     // TODO: code block above should be moved to the init function
 
     hamilt::onsite_ps_op<Real, Device>()(
@@ -225,23 +225,23 @@ void OnsiteProj<OperatorPW<T, Device>>::cal_ps_dftu(const int npol, const int m)
     // T *ps = new T[tnp * m];
     // ModuleBase::GlobalFunc::ZEROS(ps, m * tnp);
     if (this->nkb_m < m * tnp) {
-        resmem_complex_op()(this->ctx, this->ps, tnp * m, "OnsiteProj<PW>::ps");
+        resmem_complex_op()(this->ps, tnp * m, "OnsiteProj<PW>::ps");
         this->nkb_m = m * tnp;
     }
     if(!this->has_delta_spin) 
     {
-        setmem_complex_op()(this->ctx, this->ps, 0, tnp * m);
+        setmem_complex_op()(this->ps, 0, tnp * m);
     }
 
     if(!this->init_dftu)
     {
         this->init_dftu = true;
         //prepare orb_l_iat, ip_m, vu_begin_iat and vu_device
-        resmem_int_op()(this->ctx, this->orb_l_iat, this->ucell->nat);
-        resmem_int_op()(this->ctx, this->ip_m, onsite_p->get_tot_nproj());
-        resmem_int_op()(this->ctx, this->vu_begin_iat, this->ucell->nat);
+        resmem_int_op()(this->orb_l_iat, this->ucell->nat);
+        resmem_int_op()(this->ip_m, onsite_p->get_tot_nproj());
+        resmem_int_op()(this->vu_begin_iat, this->ucell->nat);
         // recal the ip_iat
-        resmem_int_op()(this->ctx, this->ip_iat, onsite_p->get_tot_nproj());
+        resmem_int_op()(this->ip_iat, onsite_p->get_tot_nproj());
         std::vector<int> ip_iat0(onsite_p->get_tot_nproj());
         std::vector<int> ip_m0(onsite_p->get_tot_nproj());
         std::vector<int> vu_begin_iat0(this->ucell->nat);
@@ -285,15 +285,15 @@ void OnsiteProj<OperatorPW<T, Device>>::cal_ps_dftu(const int npol, const int m)
                 }
             }
         }
-        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->orb_l_iat, orb_l_iat0.data(), this->ucell->nat);
-        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->ip_iat, ip_iat0.data(), onsite_p->get_tot_nproj());
-        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->ip_m, ip_m0.data(), onsite_p->get_tot_nproj());
-        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->vu_begin_iat, vu_begin_iat0.data(), this->ucell->nat);
+        syncmem_int_h2d_op()(this->orb_l_iat, orb_l_iat0.data(), this->ucell->nat);
+        syncmem_int_h2d_op()(this->ip_iat, ip_iat0.data(), onsite_p->get_tot_nproj());
+        syncmem_int_h2d_op()(this->ip_m, ip_m0.data(), onsite_p->get_tot_nproj());
+        syncmem_int_h2d_op()(this->vu_begin_iat, vu_begin_iat0.data(), this->ucell->nat);
 
-        resmem_complex_op()(this->ctx, this->vu_device, dftu->get_size_eff_pot_pw());
+        resmem_complex_op()(this->vu_device, dftu->get_size_eff_pot_pw());
     }
 
-    syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, this->vu_device, dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw());
+    syncmem_complex_h2d_op()(this->vu_device, dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw());
 
     hamilt::onsite_ps_op<Real, Device>()(
         this->ctx,   // device context
diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/veff_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/veff_pw.cpp
index 2343ee7ecb..6bff6b2dc0 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/veff_pw.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/veff_pw.cpp
@@ -23,16 +23,16 @@ Veff<OperatorPW<T, Device>>::Veff(const int* isk_in,
     this->veff_row = veff_row;
     this->veff_col = veff_col;
     this->wfcpw = wfcpw_in;
-    resmem_complex_op()(this->ctx, this->porter, this->wfcpw->nmaxgr, "Veff<PW>::porter");
-    resmem_complex_op()(this->ctx, this->porter1, this->wfcpw->nmaxgr, "Veff<PW>::porter1");
+    resmem_complex_op()(this->porter, this->wfcpw->nmaxgr, "Veff<PW>::porter");
+    resmem_complex_op()(this->porter1, this->wfcpw->nmaxgr, "Veff<PW>::porter1");
 
 }
 
 template<typename T, typename Device>
 Veff<OperatorPW<T, Device>>::~Veff()
 {
-    delmem_complex_op()(this->ctx, this->porter);
-    delmem_complex_op()(this->ctx, this->porter1);
+    delmem_complex_op()(this->porter);
+    delmem_complex_op()(this->porter1);
 }
 
 template<typename T, typename Device>
@@ -48,7 +48,7 @@ void Veff<OperatorPW<T, Device>>::act(
     ModuleBase::timer::tick("Operator", "VeffPW");
     if(is_first_node)
     {
-        setmem_complex_op()(this->ctx, tmhpsi, 0, nbasis*nbands/npol);
+        setmem_complex_op()(tmhpsi, 0, nbasis*nbands/npol);
     }
 
     int max_npw = nbasis / npol;
@@ -124,8 +124,8 @@ hamilt::Veff<OperatorPW<T, Device>>::Veff(const Veff<OperatorPW<T_in, Device_in>
     this->veff_col = veff->get_veff_col();
     this->veff_row = veff->get_veff_row();
     this->wfcpw = veff->get_wfcpw();
-    resmem_complex_op()(this->ctx, this->porter, this->wfcpw->nmaxgr);
-    resmem_complex_op()(this->ctx, this->porter1, this->wfcpw->nmaxgr);
+    resmem_complex_op()(this->porter, this->wfcpw->nmaxgr);
+    resmem_complex_op()(this->porter1, this->wfcpw->nmaxgr);
     this->veff = veff->get_veff();
     if (this->isk == nullptr || this->veff == nullptr || this->wfcpw == nullptr) {
         ModuleBase::WARNING_QUIT("VeffPW", "Constuctor of Operator::VeffPW is failed, please check your code!");
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_cc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_cc.cpp
index ab8d9b3fa1..bbdefb737a 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_cc.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_cc.cpp
@@ -289,35 +289,35 @@ void Stress_Func<FPTYPE, Device>::deriv_drhoc
 	double *aux_d = nullptr;
 	double *drhocg_d = nullptr;
 	if(this->device == base_device::GpuDevice ) {
-		resmem_var_op()(this->ctx, r_d, mesh);
-		resmem_var_op()(this->ctx, rhoc_d, mesh);
-		resmem_var_op()(this->ctx, rab_d, mesh);
-
-		resmem_var_op()(this->ctx, aux_d, mesh);
-		resmem_var_op()(this->ctx, gx_arr_d, rho_basis->ngg);
-		resmem_var_op()(this->ctx, drhocg_d, rho_basis->ngg);
-
-		syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gx_arr_d, gx_arr.data(), rho_basis->ngg);
-		syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, r_d, r, mesh);
-		syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rab_d, rab, mesh);
-		syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rhoc_d, rhoc, mesh);
+		resmem_var_op()(r_d, mesh);
+		resmem_var_op()(rhoc_d, mesh);
+		resmem_var_op()(rab_d, mesh);
+
+		resmem_var_op()(aux_d, mesh);
+		resmem_var_op()(gx_arr_d, rho_basis->ngg);
+		resmem_var_op()(drhocg_d, rho_basis->ngg);
+
+		syncmem_var_h2d_op()(gx_arr_d, gx_arr.data(), rho_basis->ngg);
+		syncmem_var_h2d_op()(r_d, r, mesh);
+		syncmem_var_h2d_op()(rab_d, rab, mesh);
+		syncmem_var_h2d_op()(rhoc_d, rhoc, mesh);
 	}
 
 	if(this->device == base_device::GpuDevice) {
 		hamilt::cal_stress_drhoc_aux_op<FPTYPE, Device>()(
 			r_d,rhoc_d,gx_arr_d+igl0,rab_d,drhocg_d+igl0,mesh,igl0,rho_basis->ngg-igl0,omega,type);
-		syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, drhocg+igl0, drhocg_d+igl0, rho_basis->ngg-igl0);	
+		syncmem_var_d2h_op()(drhocg+igl0, drhocg_d+igl0, rho_basis->ngg-igl0);	
 
 	} else {
 		hamilt::cal_stress_drhoc_aux_op<FPTYPE, Device>()(
 			r,rhoc,gx_arr.data()+igl0,rab,drhocg+igl0,mesh,igl0,rho_basis->ngg-igl0,omega,type);
 
 	}
-    delmem_var_op()(this->ctx, r_d);
-    delmem_var_op()(this->ctx, rhoc_d);
-    delmem_var_op()(this->ctx, rab_d);
-    delmem_var_op()(this->ctx, gx_arr_d);
-    delmem_var_op()(this->ctx, drhocg_d);
+    delmem_var_op()(r_d);
+    delmem_var_op()(rhoc_d);
+    delmem_var_op()(rab_d);
+    delmem_var_op()(gx_arr_d);
+    delmem_var_op()(drhocg_d);
 	return;
 }
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
index 740d692c39..42e619c9bc 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp
@@ -244,22 +244,20 @@ const UnitCell& ucell_in
     double *aux_d = nullptr;
 	double *drhocg_d = nullptr;
     if (this->device == base_device::GpuDevice) {
-        resmem_var_op()(this->ctx, r_d, msh);
-        resmem_var_op()(this->ctx, rhoc_d, msh);
-        resmem_var_op()(this->ctx, rab_d, msh);
+        resmem_var_op()(r_d, msh);
+        resmem_var_op()(rhoc_d, msh);
+        resmem_var_op()(rab_d, msh);
 
-        resmem_var_op()(this->ctx, aux_d, msh);
-        resmem_var_op()(this->ctx, gx_arr_d, rho_basis->ngg+1);
-        resmem_var_op()(this->ctx, drhocg_d, rho_basis->ngg);
+        resmem_var_op()(aux_d, msh);
+        resmem_var_op()(gx_arr_d, rho_basis->ngg+1);
+        resmem_var_op()(drhocg_d, rho_basis->ngg);
 
-        syncmem_var_h2d_op()(this->ctx,
-                             this->cpu_ctx,
-                             gx_arr_d,
+        syncmem_var_h2d_op()(gx_arr_d,
                              gx_arr.data(),
                              rho_basis->ngg+1);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, r_d, r, msh);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rab_d, rab, msh);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rhoc_d, aux.data(), msh);
+        syncmem_var_h2d_op()(r_d, r, msh);
+        syncmem_var_h2d_op()(rab_d, rab, msh);
+        syncmem_var_h2d_op()(rhoc_d, aux.data(), msh);
     }
 
 
@@ -267,7 +265,7 @@ const UnitCell& ucell_in
 	if(this->device == base_device::GpuDevice) {
 		hamilt::cal_stress_drhoc_aux_op<FPTYPE, Device>()(
 			r_d,rhoc_d,gx_arr_d+igl0,rab_d,drhocg_d+igl0,msh,igl0,rho_basis->ngg-igl0,ucell_in.omega,3);
-		syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, dvloc+igl0, drhocg_d+igl0, rho_basis->ngg-igl0);	
+		syncmem_var_d2h_op()(dvloc+igl0, drhocg_d+igl0, rho_basis->ngg-igl0);	
 
 	} else {
 		hamilt::cal_stress_drhoc_aux_op<FPTYPE, Device>()(
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp
index 73b9e08a82..1af82ba153 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp
@@ -30,8 +30,8 @@ void Stress_Func<FPTYPE, Device>::stress_nl(ModuleBase::matrix& sigma,
     ModuleBase::timer::tick("Stress_Func", "stress_nl");
 
     FPTYPE* stress_device = nullptr;
-    resmem_var_op()(this->ctx, stress_device, 9);
-    setmem_var_op()(this->ctx, stress_device, 0, 9);
+    resmem_var_op()(stress_device, 9);
+    setmem_var_op()(stress_device, 0, 9);
     std::vector<FPTYPE> sigmanlc(9, 0.0);
 
     hamilt::FS_Nonlocal_tools<FPTYPE, Device> nl_tools(&nlpp_in, &ucell_in, p_kv, wfc_basis, p_sf, wg, &ekb);
@@ -69,8 +69,8 @@ void Stress_Func<FPTYPE, Device>::stress_nl(ModuleBase::matrix& sigma,
         }
     }
     // transfer stress from device to host
-    syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, sigmanlc.data(), stress_device, 9);
-    delmem_var_op()(this->ctx, stress_device);
+    syncmem_var_d2h_op()(sigmanlc.data(), stress_device, 9);
+    delmem_var_op()(stress_device);
     // sum up forcenl from all processors
     for (int l = 0; l < 3; l++)
     {
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp
index 8568821a10..acce052e83 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp
@@ -22,8 +22,8 @@ void Stress_Func<FPTYPE, Device>::stress_onsite(ModuleBase::matrix& sigma,
     ModuleBase::timer::tick("Stress_Func", "stress_onsite");
 
     FPTYPE* stress_device = nullptr;
-    resmem_var_op()(this->ctx, stress_device, 9);
-    setmem_var_op()(this->ctx, stress_device, 0, 9);
+    resmem_var_op()(stress_device, 9);
+    setmem_var_op()(stress_device, 0, 9);
     std::vector<FPTYPE> sigma_onsite(9, 0.0);
 
     auto* onsite_p = projectors::OnsiteProjector<FPTYPE, Device>::get_instance();
@@ -68,8 +68,8 @@ void Stress_Func<FPTYPE, Device>::stress_onsite(ModuleBase::matrix& sigma,
         }
     }
     // transfer stress from device to host
-    syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, sigma_onsite.data(), stress_device, 9);
-    delmem_var_op()(this->ctx, stress_device);
+    syncmem_var_d2h_op()(sigma_onsite.data(), stress_device, 9);
+    delmem_var_op()(stress_device);
     // sum up forcenl from all processors
     for (int l = 0; l < 3; l++)
     {
diff --git a/source/module_hamilt_pw/hamilt_pwdft/structure_factor.cpp b/source/module_hamilt_pw/hamilt_pwdft/structure_factor.cpp
index 24dcbe27ce..4e328c1fda 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/structure_factor.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/structure_factor.cpp
@@ -27,19 +27,19 @@ Structure_Factor::~Structure_Factor()
 {
     if (device == "gpu") {
         if (PARAM.inp.precision == "single") {
-            delmem_cd_op()(gpu_ctx, this->c_eigts1);
-            delmem_cd_op()(gpu_ctx, this->c_eigts2);
-            delmem_cd_op()(gpu_ctx, this->c_eigts3);
+            delmem_cd_op()(this->c_eigts1);
+            delmem_cd_op()(this->c_eigts2);
+            delmem_cd_op()(this->c_eigts3);
         }
-        delmem_zd_op()(gpu_ctx, this->z_eigts1);
-        delmem_zd_op()(gpu_ctx, this->z_eigts2);
-        delmem_zd_op()(gpu_ctx, this->z_eigts3);
+        delmem_zd_op()(this->z_eigts1);
+        delmem_zd_op()(this->z_eigts2);
+        delmem_zd_op()(this->z_eigts3);
     }
     else {
         if (PARAM.inp.precision == "single") {
-            delmem_ch_op()(cpu_ctx, this->c_eigts1);
-            delmem_ch_op()(cpu_ctx, this->c_eigts2);
-            delmem_ch_op()(cpu_ctx, this->c_eigts3);
+            delmem_ch_op()(this->c_eigts1);
+            delmem_ch_op()(this->c_eigts2);
+            delmem_ch_op()(this->c_eigts3);
         }
         // There's no need to delete double precision pointers while in a CPU environment.
     }
@@ -151,28 +151,28 @@ void Structure_Factor::setup_structure_factor(const UnitCell* Ucell, const Paral
     }
     if (device == "gpu") {
         if (PARAM.inp.precision == "single") {
-            resmem_cd_op()(gpu_ctx, this->c_eigts1, Ucell->nat * (2 * rho_basis->nx + 1));
-            resmem_cd_op()(gpu_ctx, this->c_eigts2, Ucell->nat * (2 * rho_basis->ny + 1));
-            resmem_cd_op()(gpu_ctx, this->c_eigts3, Ucell->nat * (2 * rho_basis->nz + 1));
-            castmem_z2c_h2d_op()(gpu_ctx, cpu_ctx, this->c_eigts1, this->eigts1.c, Ucell->nat * (2 * rho_basis->nx + 1));
-            castmem_z2c_h2d_op()(gpu_ctx, cpu_ctx, this->c_eigts2, this->eigts2.c, Ucell->nat * (2 * rho_basis->ny + 1));
-            castmem_z2c_h2d_op()(gpu_ctx, cpu_ctx, this->c_eigts3, this->eigts3.c, Ucell->nat * (2 * rho_basis->nz + 1));
+            resmem_cd_op()(this->c_eigts1, Ucell->nat * (2 * rho_basis->nx + 1));
+            resmem_cd_op()(this->c_eigts2, Ucell->nat * (2 * rho_basis->ny + 1));
+            resmem_cd_op()(this->c_eigts3, Ucell->nat * (2 * rho_basis->nz + 1));
+            castmem_z2c_h2d_op()(this->c_eigts1, this->eigts1.c, Ucell->nat * (2 * rho_basis->nx + 1));
+            castmem_z2c_h2d_op()(this->c_eigts2, this->eigts2.c, Ucell->nat * (2 * rho_basis->ny + 1));
+            castmem_z2c_h2d_op()(this->c_eigts3, this->eigts3.c, Ucell->nat * (2 * rho_basis->nz + 1));
         }
-        resmem_zd_op()(gpu_ctx, this->z_eigts1, Ucell->nat * (2 * rho_basis->nx + 1));
-        resmem_zd_op()(gpu_ctx, this->z_eigts2, Ucell->nat * (2 * rho_basis->ny + 1));
-        resmem_zd_op()(gpu_ctx, this->z_eigts3, Ucell->nat * (2 * rho_basis->nz + 1));
-        syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, this->z_eigts1, this->eigts1.c, Ucell->nat * (2 * rho_basis->nx + 1));
-        syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, this->z_eigts2, this->eigts2.c, Ucell->nat * (2 * rho_basis->ny + 1));
-        syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, this->z_eigts3, this->eigts3.c, Ucell->nat * (2 * rho_basis->nz + 1));
+        resmem_zd_op()(this->z_eigts1, Ucell->nat * (2 * rho_basis->nx + 1));
+        resmem_zd_op()(this->z_eigts2, Ucell->nat * (2 * rho_basis->ny + 1));
+        resmem_zd_op()(this->z_eigts3, Ucell->nat * (2 * rho_basis->nz + 1));
+        syncmem_z2z_h2d_op()(this->z_eigts1, this->eigts1.c, Ucell->nat * (2 * rho_basis->nx + 1));
+        syncmem_z2z_h2d_op()(this->z_eigts2, this->eigts2.c, Ucell->nat * (2 * rho_basis->ny + 1));
+        syncmem_z2z_h2d_op()(this->z_eigts3, this->eigts3.c, Ucell->nat * (2 * rho_basis->nz + 1));
     }
     else {
         if (PARAM.inp.precision == "single") {
-            resmem_ch_op()(cpu_ctx, this->c_eigts1, Ucell->nat * (2 * rho_basis->nx + 1));
-            resmem_ch_op()(cpu_ctx, this->c_eigts2, Ucell->nat * (2 * rho_basis->ny + 1));
-            resmem_ch_op()(cpu_ctx, this->c_eigts3, Ucell->nat * (2 * rho_basis->nz + 1));
-            castmem_z2c_h2h_op()(cpu_ctx, cpu_ctx, this->c_eigts1, this->eigts1.c, Ucell->nat * (2 * rho_basis->nx + 1));
-            castmem_z2c_h2h_op()(cpu_ctx, cpu_ctx, this->c_eigts2, this->eigts2.c, Ucell->nat * (2 * rho_basis->ny + 1));
-            castmem_z2c_h2h_op()(cpu_ctx, cpu_ctx, this->c_eigts3, this->eigts3.c, Ucell->nat * (2 * rho_basis->nz + 1));
+            resmem_ch_op()(this->c_eigts1, Ucell->nat * (2 * rho_basis->nx + 1));
+            resmem_ch_op()(this->c_eigts2, Ucell->nat * (2 * rho_basis->ny + 1));
+            resmem_ch_op()(this->c_eigts3, Ucell->nat * (2 * rho_basis->nz + 1));
+            castmem_z2c_h2h_op()(this->c_eigts1, this->eigts1.c, Ucell->nat * (2 * rho_basis->nx + 1));
+            castmem_z2c_h2h_op()(this->c_eigts2, this->eigts2.c, Ucell->nat * (2 * rho_basis->ny + 1));
+            castmem_z2c_h2h_op()(this->c_eigts3, this->eigts3.c, Ucell->nat * (2 * rho_basis->nz + 1));
         }
         this->z_eigts1 = this->eigts1.c;
         this->z_eigts2 = this->eigts2.c;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/structure_factor_k.cpp b/source/module_hamilt_pw/hamilt_pwdft/structure_factor_k.cpp
index add76f6fb3..bca92ac1cf 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/structure_factor_k.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/structure_factor_k.cpp
@@ -91,11 +91,11 @@ void Structure_Factor::get_sk(Device* ctx,
     }
     if (device == base_device::GpuDevice)
     {
-        resmem_int_op()(ctx, atom_na, ucell->ntype);
-        syncmem_int_op()(ctx, cpu_ctx, atom_na, h_atom_na, ucell->ntype);
+        resmem_int_op()(atom_na, ucell->ntype);
+        syncmem_int_op()(atom_na, h_atom_na, ucell->ntype);
 
-        resmem_var_op()(ctx, atom_tau, ucell->nat * 3);
-        syncmem_var_op()(ctx, cpu_ctx, atom_tau, h_atom_tau, ucell->nat * 3);
+        resmem_var_op()(atom_tau, ucell->nat * 3);
+        syncmem_var_op()(atom_tau, h_atom_tau, ucell->nat * 3);
 
         igl2isz = wfc_basis->d_igl2isz_k;
         is2fftixy = wfc_basis->d_is2fftixy;
@@ -135,8 +135,8 @@ void Structure_Factor::get_sk(Device* ctx,
                 sk);
     if (device == base_device::GpuDevice)
     {
-        delmem_int_op()(ctx, atom_na);
-        delmem_var_op()(ctx, atom_tau);
+        delmem_int_op()(atom_na);
+        delmem_var_op()(atom_tau);
     }
     delete[] h_atom_na;
     delete[] h_atom_tau;
diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_che.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_che.cpp
index 23a5a18926..34e20977eb 100644
--- a/source/module_hamilt_pw/hamilt_stodft/sto_che.cpp
+++ b/source/module_hamilt_pw/hamilt_stodft/sto_che.cpp
@@ -9,7 +9,7 @@ StoChe<REAL, Device>::~StoChe()
 {
     delete p_che;
     delete[] spolyv_cpu;
-    delmem_var_op()(this->ctx, spolyv);
+    delmem_var_op()(spolyv);
 }
 
 template <typename REAL, typename Device>
@@ -20,12 +20,12 @@ StoChe<REAL, Device>::StoChe(const int& nche, const int& method, const REAL& ema
     p_che = new ModuleBase::Chebyshev<REAL, Device>(nche);
     if (method == 1)
     {
-        resmem_var_op()(this->ctx, spolyv, nche);
+        resmem_var_op()(spolyv, nche);
         spolyv_cpu = new REAL[nche];
     }
     else
     {
-        resmem_var_op()(this->ctx, spolyv, nche * nche);
+        resmem_var_op()(spolyv, nche * nche);
     }
 
     this->emax_sto = emax_sto;
diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_che.h b/source/module_hamilt_pw/hamilt_stodft/sto_che.h
index 3a7d2f0090..f241553b66 100644
--- a/source/module_hamilt_pw/hamilt_stodft/sto_che.h
+++ b/source/module_hamilt_pw/hamilt_stodft/sto_che.h
@@ -50,19 +50,17 @@ REAL vTMv(const REAL* v, const REAL* M, const int n)
     const int inc = 1;
     const REAL zero = 0;
     REAL* y = nullptr;
-    base_device::memory::resize_memory_op<REAL, Device>()(ctx, y, n);
+    base_device::memory::resize_memory_op<REAL, Device>()(y, n);
     hsolver::gemv_op<REAL, Device>()(ctx, normal, n, n, &one, M, n, v, inc, &zero, y, inc);
     REAL result = 0;
     REAL* dot_device = nullptr;
-    base_device::memory::resize_memory_op<REAL, Device>()(ctx, dot_device, 1);
+    base_device::memory::resize_memory_op<REAL, Device>()(dot_device, 1);
     container::kernels::blas_dot<REAL, ct_Device>()(n, y, 1, v, 1, dot_device);
-    base_device::memory::synchronize_memory_op<REAL, base_device::DEVICE_CPU, Device>()(cpu_ctx,
-                                                                                        ctx,
-                                                                                        &result,
+    base_device::memory::synchronize_memory_op<REAL, base_device::DEVICE_CPU, Device>()(&result,
                                                                                         dot_device,
                                                                                         1);
-    base_device::memory::delete_memory_op<REAL, Device>()(ctx, y);
-    base_device::memory::delete_memory_op<REAL, Device>()(ctx, dot_device);
+    base_device::memory::delete_memory_op<REAL, Device>()(y);
+    base_device::memory::delete_memory_op<REAL, Device>()(dot_device);
     return result;
 }
 
diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp
index db54e40db0..6684332781 100644
--- a/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp
+++ b/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp
@@ -217,8 +217,8 @@ void Sto_Forces<FPTYPE, Device>::cal_sto_force_nl(
 
    // allocate memory for the force
     FPTYPE* force = nullptr;
-    resmem_var_op()(this->ctx, force, ucell.nat * 3);
-    base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, force, 0.0, ucell.nat * 3);
+    resmem_var_op()(force, ucell.nat * 3);
+    base_device::memory::set_memory_op<FPTYPE, Device>()(force, 0.0, ucell.nat * 3);
 
     hamilt::FS_Nonlocal_tools<FPTYPE, Device> nl_tools(&nlpp, &ucell, p_kv, wfc_basis, p_sf, wg, nullptr);
 
@@ -250,8 +250,8 @@ void Sto_Forces<FPTYPE, Device>::cal_sto_force_nl(
         nl_tools.cal_force(ik, max_nbands, nstobands, false, force, nksbands);
     } // end ik
 
-    syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forcenl.c, force, forcenl.nr * forcenl.nc);
-    delmem_var_op()(this->ctx, force);
+    syncmem_var_d2h_op()(forcenl.c, force, forcenl.nr * forcenl.nc);
+    delmem_var_op()(force);
     // sum up forcenl from all processors
     Parallel_Reduce::reduce_all(forcenl.c, forcenl.nr * forcenl.nc);
 
diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp
index ec4aa26c1c..8ec669febd 100644
--- a/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp
+++ b/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp
@@ -27,10 +27,10 @@ template <typename T, typename Device>
 void Stochastic_Iter<T, Device>::dot(const int& n, const Real* x, const int& incx, const Real* y, const int& incy, Real& result)
 {
     Real* result_device = nullptr;
-    resmem_var_op()(this->ctx, result_device, 1);
-    container::kernels::blas_dot<Real, ct_Device>()(n, p_che->coef_real, 1, spolyv, 1, result_device);
-    syncmem_var_d2h_op()(cpu_ctx, this->ctx, &result, result_device, 1);
-    delmem_var_op()(this->ctx, result_device);
+    resmem_var_op()(result_device, 1); // one-element scratch buffer that receives the dot product
+    container::kernels::blas_dot<Real, ct_Device>()(n, x, incx, y, incy, result_device); // use the caller's vectors/strides (was hard-coded to p_che->coef_real and spolyv)
+    syncmem_var_d2h_op()(&result, result_device, 1); // copy the scalar back into the caller's `result`
+    delmem_var_op()(result_device);
 }
 
 template <typename T, typename Device>
@@ -65,7 +65,7 @@ void Stochastic_Iter<T, Device>::orthog(const int& ik, psi::Psi<T, Device>& psi,
         stowf.chi0->fix_k(ik);
         stowf.chiortho->fix_k(ik);
         T *wfgin = stowf.chi0->get_pointer(), *wfgout = stowf.chiortho->get_pointer();
-        cpymem_complex_op()(this->ctx, this->ctx, wfgout, wfgin, npwx * nchipk);
+        cpymem_complex_op()(wfgout, wfgin, npwx * nchipk);
         // for (int ig = 0; ig < npwx * nchipk; ++ig)
         // {
         //     wfgout[ig] = wfgin[ig];
@@ -73,7 +73,7 @@ void Stochastic_Iter<T, Device>::orthog(const int& ik, psi::Psi<T, Device>& psi,
 
         // orthogonal part
         T* sum = nullptr;
-        resmem_complex_op()(this->ctx, sum, PARAM.inp.nbands * nchipk);
+        resmem_complex_op()(sum, PARAM.inp.nbands * nchipk);
         char transC = 'C';
         char transN = 'N';
 
@@ -109,7 +109,7 @@ void Stochastic_Iter<T, Device>::orthog(const int& ik, psi::Psi<T, Device>& psi,
                                       &ModuleBase::ONE,
                                       wfgout,
                                       npwx);
-        delmem_complex_op()(this->ctx, sum);
+        delmem_complex_op()(sum);
     }
     ModuleBase::timer::tick("Stochastic_Iter", "orthog");
 }
@@ -209,8 +209,8 @@ void Stochastic_Iter<T, Device>::check_precision(const double ref, const double
     {
         Real last_coef = 0;
         Real last_spolyv = 0;
-        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, &last_coef, &p_che->coef_real[p_che->norder - 1], 1);
-        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, &last_spolyv, &spolyv[p_che->norder - 1], 1);
+        syncmem_var_d2h_op()(&last_coef, &p_che->coef_real[p_che->norder - 1], 1);
+        syncmem_var_d2h_op()(&last_spolyv, &spolyv[p_che->norder - 1], 1);
         error = last_coef * last_spolyv;
     }
     else
@@ -220,8 +220,8 @@ void Stochastic_Iter<T, Device>::check_precision(const double ref, const double
         // double last_spolyv = spolyv[norder * norder - 1];
         Real last_coef = 0;
         Real last_spolyv = 0;
-        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, &last_coef, &p_che->coef_real[norder - 1], 1);
-        syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, &last_spolyv, &spolyv[norder * norder - 1], 1);
+        syncmem_var_d2h_op()(&last_coef, &p_che->coef_real[norder - 1], 1);
+        syncmem_var_d2h_op()(&last_spolyv, &spolyv[norder * norder - 1], 1);
         Real dot1 = 0, dot2 = 0;
         this->dot(norder, p_che->coef_real, 1, spolyv + norder * (norder - 1), 1, dot1);
         this->dot(norder, p_che->coef_real, 1, spolyv + norder - 1, norder, dot2);
@@ -362,7 +362,7 @@ void Stochastic_Iter<T, Device>::calPn(const int& ik, Stochastic_WF<T, Device>&
         }
         else
         {
-            setmem_var_op()(this->ctx, spolyv, 0, norder * norder);
+            setmem_var_op()(spolyv, 0, norder * norder);
         }
     }
     T* pchi;
@@ -391,7 +391,7 @@ void Stochastic_Iter<T, Device>::calPn(const int& ik, Stochastic_WF<T, Device>&
         }
         if(ik == this->pkv->get_nks() - 1)
         {
-            syncmem_var_h2d_op()(this->ctx, cpu_ctx, spolyv, spolyv_cpu, norder);
+            syncmem_var_h2d_op()(spolyv, spolyv_cpu, norder);
         }
     }
     else
@@ -539,7 +539,7 @@ void Stochastic_Iter<T, Device>::sum_stoeband(Stochastic_WF<T, Device>& stowf,
             const int npw = this->pkv->ngk[ik];
             const double kweight = this->pkv->wk[ik];
             T* hshchi = nullptr;
-            resmem_complex_op()(this->ctx, hshchi, nchip_ik * npwx);
+            resmem_complex_op()(hshchi, nchip_ik * npwx);
             T* tmpin = stowf.shchi->get_pointer();
             T* tmpout = hshchi;
             p_hamilt_sto->hPsi(tmpin, tmpout, nchip_ik);
@@ -549,7 +549,7 @@ void Stochastic_Iter<T, Device>::sum_stoeband(Stochastic_WF<T, Device>& stowf,
                 tmpin += npwx;
                 tmpout += npwx;
             }
-            delmem_complex_op()(this->ctx, hshchi);
+            delmem_complex_op()(hshchi);
         }
     }
 #ifdef __MPI
@@ -573,7 +573,7 @@ void Stochastic_Iter<T, Device>::cal_storho(const UnitCell& ucell,
     const int nspin = PARAM.inp.nspin;
 
     T* porter = nullptr;
-    resmem_complex_op()(this->ctx, porter, nrxx);
+    resmem_complex_op()(porter, nrxx);
 
     std::vector<double*> sto_rho(nspin);
     for(int is = 0; is < nspin; ++is)
@@ -597,7 +597,7 @@ void Stochastic_Iter<T, Device>::cal_storho(const UnitCell& ucell,
     }
     for (int is = 0; is < nspin; is++)
     {
-        setmem_var_op()(this->ctx, pes->rho[is], 0, nrxx);
+        setmem_var_op()(pes->rho[is], 0, nrxx);
     }
     for (int ik = 0; ik < this->pkv->get_nks(); ++ik)
     {
@@ -624,7 +624,7 @@ void Stochastic_Iter<T, Device>::cal_storho(const UnitCell& ucell,
     if (PARAM.inp.device == "gpu" || PARAM.inp.precision == "single") {
         for(int is = 0; is < nspin; ++is)
         {
-            castmem_var_d2h_op()(this->cpu_ctx, this->ctx, sto_rho[is], pes->rho[is], nrxx);
+            castmem_var_d2h_op()(sto_rho[is], pes->rho[is], nrxx);
         }
     }
     else
@@ -633,7 +633,7 @@ void Stochastic_Iter<T, Device>::cal_storho(const UnitCell& ucell,
         pes->rho = reinterpret_cast<Real **>(pes->charge->rho);
     }
 
-    delmem_complex_op()(this->ctx, porter);
+    delmem_complex_op()(porter);
 #ifdef __MPI
     if(GlobalV::KPAR > 1)
     {
@@ -735,11 +735,11 @@ void Stochastic_Iter<T, Device>::calTnchi_ik(const int& ik, Stochastic_WF<T, Dev
         const int M = npwx * nchip[ik];
         const int N = p_che->norder;
         T* coef_real = nullptr;
-        resmem_complex_op()(this->ctx, coef_real, N);
-        castmem_d2z_op()(this->ctx, this->ctx, coef_real, p_che->coef_real, p_che->norder);
+        resmem_complex_op()(coef_real, N);
+        castmem_d2z_op()(coef_real, p_che->coef_real, p_che->norder);
         gemv_op()(this->ctx, transa, M, N, &one, stowf.chiallorder[ik].get_pointer(), LDA, coef_real, inc, &zero, out, inc);
         // zgemv_(&transa, &M, &N, &one, stowf.chiallorder[ik].get_pointer(), &LDA, coef_real, &inc, &zero, out, &inc);
-        delmem_complex_op()(this->ctx, coef_real);
+        delmem_complex_op()(coef_real);
     }
     else
     {
diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp
index 5be294f2e7..62a4c16779 100644
--- a/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp
+++ b/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp
@@ -167,8 +167,8 @@ void Sto_Stress_PW<FPTYPE, Device>::sto_stress_nl(ModuleBase::matrix& sigma,
 
     // allocate memory for the stress
     FPTYPE* stress_device = nullptr;
-    resmem_var_op()(this->ctx, stress_device, 9);
-    setmem_var_op()(this->ctx, stress_device, 0, 9);
+    resmem_var_op()(stress_device, 9);
+    setmem_var_op()(stress_device, 0, 9);
     std::vector<FPTYPE> sigmanlc(9, 0.0);
 
     hamilt::FS_Nonlocal_tools<FPTYPE, Device> nl_tools(&nlpp, &ucell, p_kv, wfc_basis, p_sf, wg, nullptr);
@@ -201,8 +201,8 @@ void Sto_Stress_PW<FPTYPE, Device>::sto_stress_nl(ModuleBase::matrix& sigma,
     }
 
     // transfer stress from device to host
-    syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, sigmanlc.data(), stress_device, 9);
-    delmem_var_op()(this->ctx, stress_device);
+    syncmem_var_d2h_op()(sigmanlc.data(), stress_device, 9);
+    delmem_var_op()(stress_device);
     // sum up forcenl from all processors
     for (int l = 0; l < 3; l++)
     {
diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_wf.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_wf.cpp
index 8a76daa9e9..d3d720106c 100644
--- a/source/module_hamilt_pw/hamilt_stodft/sto_wf.cpp
+++ b/source/module_hamilt_pw/hamilt_stodft/sto_wf.cpp
@@ -52,7 +52,7 @@ void Stochastic_WF<T, Device>::allocate_chiallorder(const int& norder)
     for (int ik = 0; ik < this->nks; ++ik)
     {
         chiallorder[ik].resize(1, this->nchip[ik] * this->npwx, norder);
-        setmem_complex_op()(chiallorder[ik].get_device(), chiallorder[ik].get_pointer(), 0, chiallorder[ik].size());
+        setmem_complex_op()(chiallorder[ik].get_pointer(), 0, chiallorder[ik].size());
     }
 }
 
@@ -374,9 +374,7 @@ void Stochastic_WF<T, Device>::sync_chi0()
     Device* ctx = {};
     if (base_device::get_device_type<Device>(ctx) == base_device::GpuDevice)
     {
-        syncmem_h2d_op()(this->chi0->get_device(),
-                         this->chi0_cpu->get_device(),
-                         this->chi0->get_pointer(),
+        syncmem_h2d_op()(this->chi0->get_pointer(),
                          this->chi0_cpu->get_pointer(),
                          this->chi0_cpu->size());
     }
diff --git a/source/module_hsolver/diag_const_nums.cpp b/source/module_hsolver/diag_const_nums.cpp
index 8b459cbf7c..4d9cb8fd83 100644
--- a/source/module_hsolver/diag_const_nums.cpp
+++ b/source/module_hsolver/diag_const_nums.cpp
@@ -11,14 +11,11 @@ template class const_nums<std::complex<float>>;
 template <>
 const_nums<double>::const_nums()
 {
-    base_device::memory::resize_memory_op<double, base_device::DEVICE_CPU>()(
-                        this->cpu_ctx, this->zero, 1);
+    base_device::memory::resize_memory_op<double, base_device::DEVICE_CPU>()(this->zero, 1);
     this->zero[0] = 0.0;
-    base_device::memory::resize_memory_op<double, base_device::DEVICE_CPU>()(
-                        this->cpu_ctx, this->one, 1);
+    base_device::memory::resize_memory_op<double, base_device::DEVICE_CPU>()(this->one, 1);
     this->one[0] = 1.0;
-    base_device::memory::resize_memory_op<double, base_device::DEVICE_CPU>()(
-                        this->cpu_ctx, this->neg_one, 1);
+    base_device::memory::resize_memory_op<double, base_device::DEVICE_CPU>()(this->neg_one, 1);
     this->neg_one[0] = -1.0;
 }
 
@@ -26,14 +23,11 @@ const_nums<double>::const_nums()
 template <>
 const_nums<float>::const_nums()
 {
-    base_device::memory::resize_memory_op<float, base_device::DEVICE_CPU>()(
-                        this->cpu_ctx, this->zero, 1);
+    base_device::memory::resize_memory_op<float, base_device::DEVICE_CPU>()(this->zero, 1);
     this->zero[0] = 0.0;
-    base_device::memory::resize_memory_op<float, base_device::DEVICE_CPU>()(
-                        this->cpu_ctx, this->one, 1);
+    base_device::memory::resize_memory_op<float, base_device::DEVICE_CPU>()(this->one, 1);
     this->one[0] = 1.0;
-    base_device::memory::resize_memory_op<float, base_device::DEVICE_CPU>()(
-                        this->cpu_ctx, this->neg_one, 1);
+    base_device::memory::resize_memory_op<float, base_device::DEVICE_CPU>()(this->neg_one, 1);
     this->neg_one[0] = -1.0;
 }
 
@@ -41,14 +35,11 @@ const_nums<float>::const_nums()
 template <>
 const_nums<std::complex<double>>::const_nums()
 {
-    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_CPU>()(
-                        this->cpu_ctx, this->zero, 1);
+    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_CPU>()(this->zero, 1);
     this->zero[0] = std::complex<double>(0.0, 0.0);
-    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_CPU>()(
-                        this->cpu_ctx, this->one, 1);
+    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_CPU>()(this->one, 1);
     this->one[0] = std::complex<double>(1.0, 0.0);
-    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_CPU>()(
-                        this->cpu_ctx, this->neg_one, 1);
+    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_CPU>()(this->neg_one, 1);
     this->neg_one[0] = std::complex<double>(-1.0, 0.0);
 }
 
@@ -56,13 +47,10 @@ const_nums<std::complex<double>>::const_nums()
 template <>
 const_nums<std::complex<float>>::const_nums()
 {
-    base_device::memory::resize_memory_op<std::complex<float>, base_device::DEVICE_CPU>()(
-                        this->cpu_ctx, this->zero, 1);
+    base_device::memory::resize_memory_op<std::complex<float>, base_device::DEVICE_CPU>()(this->zero, 1);
     this->zero[0] = std::complex<float>(0.0, 0.0);
-    base_device::memory::resize_memory_op<std::complex<float>, base_device::DEVICE_CPU>()(
-                        this->cpu_ctx, this->one, 1);
+    base_device::memory::resize_memory_op<std::complex<float>, base_device::DEVICE_CPU>()(this->one, 1);
     this->one[0] = std::complex<float>(1.0, 0.0);
-    base_device::memory::resize_memory_op<std::complex<float>, base_device::DEVICE_CPU>()(
-                        this->cpu_ctx, this->neg_one, 1);
+    base_device::memory::resize_memory_op<std::complex<float>, base_device::DEVICE_CPU>()(this->neg_one, 1);
     this->neg_one[0] = std::complex<float>(-1.0, 0.0);
 }
\ No newline at end of file
diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp
index 82dadcb0d0..f7daf229a2 100644
--- a/source/module_hsolver/diago_dav_subspace.cpp
+++ b/source/module_hsolver/diago_dav_subspace.cpp
@@ -46,30 +46,30 @@ Diago_DavSubspace<T, Device>::Diago_DavSubspace(const std::vector<Real>& precond
     // TODO: Added memory usage statistics
 
     //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-    resmem_complex_op()(this->ctx, this->psi_in_iter, this->nbase_x * this->dim, "DAV::psi_in_iter");
-    setmem_complex_op()(this->ctx, this->psi_in_iter, 0, this->nbase_x * this->dim);
+    resmem_complex_op()(this->psi_in_iter, this->nbase_x * this->dim, "DAV::psi_in_iter");
+    setmem_complex_op()(this->psi_in_iter, 0, this->nbase_x * this->dim);
 
     // the product of H and psi in the reduced psi set
-    resmem_complex_op()(this->ctx, this->hphi, this->nbase_x * this->dim, "DAV::hphi");
-    setmem_complex_op()(this->ctx, this->hphi, 0, this->nbase_x * this->dim);
+    resmem_complex_op()(this->hphi, this->nbase_x * this->dim, "DAV::hphi");
+    setmem_complex_op()(this->hphi, 0, this->nbase_x * this->dim);
 
     // Hamiltonian on the reduced psi set
-    resmem_complex_op()(this->ctx, this->hcc, this->nbase_x * this->nbase_x, "DAV::hcc");
-    setmem_complex_op()(this->ctx, this->hcc, 0, this->nbase_x * this->nbase_x);
+    resmem_complex_op()(this->hcc, this->nbase_x * this->nbase_x, "DAV::hcc");
+    setmem_complex_op()(this->hcc, 0, this->nbase_x * this->nbase_x);
 
     // Overlap on the reduced psi set
-    resmem_complex_op()(this->ctx, this->scc, this->nbase_x * this->nbase_x, "DAV::scc");
-    setmem_complex_op()(this->ctx, this->scc, 0, this->nbase_x * this->nbase_x);
+    resmem_complex_op()(this->scc, this->nbase_x * this->nbase_x, "DAV::scc");
+    setmem_complex_op()(this->scc, 0, this->nbase_x * this->nbase_x);
 
     // Eigenvectors
-    resmem_complex_op()(this->ctx, this->vcc, this->nbase_x * this->nbase_x, "DAV::vcc");
-    setmem_complex_op()(this->ctx, this->vcc, 0, this->nbase_x * this->nbase_x);
+    resmem_complex_op()(this->vcc, this->nbase_x * this->nbase_x, "DAV::vcc");
+    setmem_complex_op()(this->vcc, 0, this->nbase_x * this->nbase_x);
     //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == base_device::GpuDevice)
     {
-        resmem_real_op()(this->ctx, this->d_precondition, nbasis_in);
+        resmem_real_op()(this->d_precondition, nbasis_in);
         // syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in);
     }
 #endif
@@ -78,17 +78,17 @@ Diago_DavSubspace<T, Device>::Diago_DavSubspace(const std::vector<Real>& precond
 template <typename T, typename Device>
 Diago_DavSubspace<T, Device>::~Diago_DavSubspace()
 {
-    delmem_complex_op()(this->ctx, this->psi_in_iter);
+    delmem_complex_op()(this->psi_in_iter);
 
-    delmem_complex_op()(this->ctx, this->hphi);
-    delmem_complex_op()(this->ctx, this->hcc);
-    delmem_complex_op()(this->ctx, this->scc);
-    delmem_complex_op()(this->ctx, this->vcc);
+    delmem_complex_op()(this->hphi);
+    delmem_complex_op()(this->hcc);
+    delmem_complex_op()(this->scc);
+    delmem_complex_op()(this->vcc);
 
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == base_device::GpuDevice)
     {
-        delmem_real_op()(this->ctx, this->d_precondition);
+        delmem_real_op()(this->d_precondition);
     }
 #endif
 }
@@ -123,9 +123,7 @@ int Diago_DavSubspace<T, Device>::diag_once(const HPsiFunc& hpsi_func,
     {
         unconv[m] = m;
 
-        syncmem_complex_op()(this->ctx,
-                             this->ctx,
-                             this->psi_in_iter + m * this->dim,
+        syncmem_complex_op()(this->psi_in_iter + m * this->dim,
                              psi_in + m * psi_in_dmax,
                              this->dim);
     }
@@ -190,7 +188,7 @@ int Diago_DavSubspace<T, Device>::diag_once(const HPsiFunc& hpsi_func,
             ModuleBase::timer::tick("Diago_DavSubspace", "last");
 
-            // updata eigenvectors of Hamiltonian
+            // update eigenvectors of Hamiltonian
-            setmem_complex_op()(this->ctx, psi_in, 0, n_band * psi_in_dmax);
+            setmem_complex_op()(psi_in, 0, n_band * psi_in_dmax);
 
 #ifdef __DSP
     gemm_op_mt<T, Device>()  // In order to not coding another whole template, using this method to minimize the code change.
@@ -228,9 +226,7 @@ int Diago_DavSubspace<T, Device>::diag_once(const HPsiFunc& hpsi_func,
                 // update this->psi_in_iter according to psi_in
                 for (size_t i = 0; i < this->n_band; i++)
                 {
-                    syncmem_complex_op()(this->ctx,
-                                         this->ctx,
-                                         this->psi_in_iter + i * this->dim,
+                    syncmem_complex_op()(this->psi_in_iter + i * this->dim,
                                          psi_in + i * psi_in_dmax,
                                          this->dim);
                 }
@@ -273,7 +269,7 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
     {
         if (unconv[i] != i)
         {
-            syncmem_complex_op()(this->ctx, this->ctx, vcc + i * this->nbase_x, vcc + unconv[i] * this->nbase_x, nbase);
+            syncmem_complex_op()(vcc + i * this->nbase_x, vcc + unconv[i] * this->nbase_x, nbase);
             (*eigenvalue_iter)[i] = (*eigenvalue_iter)[unconv[i]];
         }
     }
@@ -303,14 +299,14 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
     if(this->device == base_device::GpuDevice)
     {
         e_temp_hd = nullptr;
-        resmem_real_op()(this->ctx, e_temp_hd, nbase);
+        resmem_real_op()(e_temp_hd, nbase);
     }
     for (int m = 0; m < notconv; m++)
     {
         e_temp_cpu.assign(nbase, (-1.0 * (*eigenvalue_iter)[m]));
         if (this->device == base_device::GpuDevice)
         {
-            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), nbase);
+            syncmem_var_h2d_op()(e_temp_hd, e_temp_cpu.data(), nbase);
         }
         vector_mul_vector_op<T, Device>()(this->ctx,
                                                 nbase,
@@ -320,7 +316,7 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
     }
     if(this->device == base_device::GpuDevice)
     {
-        delmem_real_op()(this->ctx, e_temp_hd);
+        delmem_real_op()(e_temp_hd);
     }
 
 #ifdef __DSP
@@ -356,7 +352,7 @@ void Diago_DavSubspace<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
 #if defined(__CUDA) || defined(__ROCM)
         if (this->device == base_device::GpuDevice)
         {
-            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, pre.data(), this->dim);
+            syncmem_var_h2d_op()(this->d_precondition, pre.data(), this->dim);
             vector_div_vector_op<T, Device>()(this->ctx,
                                               this->dim,
                                               psi_iter + (nbase + m) * this->dim,
@@ -461,7 +457,7 @@ void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
 #else
         auto* swap = new T[notconv * this->nbase_x];
 
-        syncmem_complex_op()(this->ctx, this->ctx, swap, hcc + nbase * this->nbase_x, notconv * this->nbase_x);
+        syncmem_complex_op()(swap, hcc + nbase * this->nbase_x, notconv * this->nbase_x);
 
         if (std::is_same<T, double>::value)
         {
@@ -491,7 +487,7 @@ void Diago_DavSubspace<T, Device>::cal_elem(const int& dim,
                            this->diag_comm.comm);
             }
 
-            syncmem_complex_op()(this->ctx, this->ctx, swap, scc + nbase * this->nbase_x, notconv * this->nbase_x);
+            syncmem_complex_op()(swap, scc + nbase * this->nbase_x, notconv * this->nbase_x);
 
             if (base_device::get_current_precision(swap) == "single")
             {
@@ -544,33 +540,33 @@ void Diago_DavSubspace<T, Device>::diag_zhegvx(const int& nbase,
         if (this->diag_comm.rank == 0)
         {
             Real* eigenvalue_gpu = nullptr;
-            resmem_real_op()(this->ctx, eigenvalue_gpu, this->nbase_x);
+            resmem_real_op()(eigenvalue_gpu, this->nbase_x);
 
-            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalue_gpu, (*eigenvalue_iter).data(), this->nbase_x);
+            syncmem_var_h2d_op()(eigenvalue_gpu, (*eigenvalue_iter).data(), this->nbase_x);
 
             T* hcc_gpu = nullptr;
             T* scc_gpu = nullptr;
             T* vcc_gpu = nullptr;
-            base_device::memory::resize_memory_op<T, Device>()(this->ctx, hcc_gpu, nbase * nbase);
-            base_device::memory::resize_memory_op<T, Device>()(this->ctx, scc_gpu, nbase * nbase);
-            base_device::memory::resize_memory_op<T, Device>()(this->ctx, vcc_gpu, nbase * nbase);
+            base_device::memory::resize_memory_op<T, Device>()(hcc_gpu, nbase * nbase);
+            base_device::memory::resize_memory_op<T, Device>()(scc_gpu, nbase * nbase);
+            base_device::memory::resize_memory_op<T, Device>()(vcc_gpu, nbase * nbase);
             for(int i=0;i<nbase;i++)
             {
-                base_device::memory::synchronize_memory_op<T, Device, Device>()(this->ctx, this->ctx, hcc_gpu + i * nbase, hcc + i * nbase_x, nbase);
-                base_device::memory::synchronize_memory_op<T, Device, Device>()(this->ctx, this->ctx, scc_gpu + i * nbase, scc + i * nbase_x, nbase);
+                base_device::memory::synchronize_memory_op<T, Device, Device>()(hcc_gpu + i * nbase, hcc + i * nbase_x, nbase);
+                base_device::memory::synchronize_memory_op<T, Device, Device>()(scc_gpu + i * nbase, scc + i * nbase_x, nbase);
             }
             dngvd_op<T, Device>()(this->ctx, nbase, nbase, hcc_gpu, scc_gpu, eigenvalue_gpu, vcc_gpu);
             for(int i=0;i<nbase;i++)
             {
-                base_device::memory::synchronize_memory_op<T, Device, Device>()(this->ctx, this->ctx, vcc + i * nbase_x, vcc_gpu + i * nbase, nbase);
+                base_device::memory::synchronize_memory_op<T, Device, Device>()(vcc + i * nbase_x, vcc_gpu + i * nbase, nbase);
             }
-            delmem_complex_op()(this->ctx, hcc_gpu);
-            delmem_complex_op()(this->ctx, scc_gpu);
-            delmem_complex_op()(this->ctx, vcc_gpu);
+            delmem_complex_op()(hcc_gpu);
+            delmem_complex_op()(scc_gpu);
+            delmem_complex_op()(vcc_gpu);
 
-            syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, (*eigenvalue_iter).data(), eigenvalue_gpu, this->nbase_x);
+            syncmem_var_d2h_op()((*eigenvalue_iter).data(), eigenvalue_gpu, this->nbase_x);
 
-            delmem_real_op()(this->ctx, eigenvalue_gpu);
+            delmem_real_op()(eigenvalue_gpu);
         }
 #endif
     }
@@ -715,16 +711,16 @@ void Diago_DavSubspace<T, Device>::refresh(const int& dim,
                          this->dim);
 
     // update hphi
-    syncmem_complex_op()(this->ctx, this->ctx, hphi, psi_iter + nband * this->dim, this->dim * nband);
+    syncmem_complex_op()(hphi, psi_iter + nband * this->dim, this->dim * nband);
 
     nbase = nband;
 
     // set hcc/scc/vcc to 0
     for (size_t i = 0; i < nbase; i++)
     {
-        setmem_complex_op()(this->ctx, &hcc[this->nbase_x * i], 0, nbase);
-        setmem_complex_op()(this->ctx, &scc[this->nbase_x * i], 0, nbase);
-        setmem_complex_op()(this->ctx, &vcc[this->nbase_x * i], 0, nbase);
+        setmem_complex_op()(&hcc[this->nbase_x * i], 0, nbase);
+        setmem_complex_op()(&scc[this->nbase_x * i], 0, nbase);
+        setmem_complex_op()(&vcc[this->nbase_x * i], 0, nbase);
     }
 
     if (this->device == base_device::GpuDevice)
@@ -733,22 +729,19 @@ void Diago_DavSubspace<T, Device>::refresh(const int& dim,
         T* hcc_cpu = nullptr;
         T* scc_cpu = nullptr;
         T* vcc_cpu = nullptr;
-        base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(this->cpu_ctx,
-                                                                            hcc_cpu,
+        base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(hcc_cpu,
                                                                             this->nbase_x * this->nbase_x,
                                                                             "DAV::hcc");
-        base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(this->cpu_ctx,
-                                                                            scc_cpu,
+        base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(scc_cpu,
                                                                             this->nbase_x * this->nbase_x,
                                                                             "DAV::scc");
-        base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(this->cpu_ctx,
-                                                                            vcc_cpu,
+        base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(vcc_cpu,
                                                                             this->nbase_x * this->nbase_x,
                                                                             "DAV::vcc");
 
-        syncmem_d2h_op()(this->cpu_ctx, this->ctx, hcc_cpu, hcc, this->nbase_x * this->nbase_x);
-        syncmem_d2h_op()(this->cpu_ctx, this->ctx, scc_cpu, scc, this->nbase_x * this->nbase_x);
-        syncmem_d2h_op()(this->cpu_ctx, this->ctx, vcc_cpu, vcc, this->nbase_x * this->nbase_x);
+        syncmem_d2h_op()(hcc_cpu, hcc, this->nbase_x * this->nbase_x);
+        syncmem_d2h_op()(scc_cpu, scc, this->nbase_x * this->nbase_x);
+        syncmem_d2h_op()(vcc_cpu, vcc, this->nbase_x * this->nbase_x);
 
         for (int i = 0; i < nbase; i++)
         {
@@ -757,13 +750,13 @@ void Diago_DavSubspace<T, Device>::refresh(const int& dim,
             vcc_cpu[i * this->nbase_x + i] = this->one[0];
         }
 
-        syncmem_h2d_op()(this->ctx, this->cpu_ctx, hcc, hcc_cpu, this->nbase_x * this->nbase_x);
-        syncmem_h2d_op()(this->ctx, this->cpu_ctx, scc, scc_cpu, this->nbase_x * this->nbase_x);
-        syncmem_h2d_op()(this->ctx, this->cpu_ctx, vcc, vcc_cpu, this->nbase_x * this->nbase_x);
+        syncmem_h2d_op()(hcc, hcc_cpu, this->nbase_x * this->nbase_x);
+        syncmem_h2d_op()(scc, scc_cpu, this->nbase_x * this->nbase_x);
+        syncmem_h2d_op()(vcc, vcc_cpu, this->nbase_x * this->nbase_x);
 
-        base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(this->cpu_ctx, hcc_cpu);
-        base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(this->cpu_ctx, scc_cpu);
-        base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(this->cpu_ctx, vcc_cpu);
+        base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(hcc_cpu);
+        base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(scc_cpu);
+        base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(vcc_cpu);
 #endif
     }
     else
diff --git a/source/module_hsolver/diago_david.cpp b/source/module_hsolver/diago_david.cpp
index b4805a82fa..6afaf998b8 100644
--- a/source/module_hsolver/diago_david.cpp
+++ b/source/module_hsolver/diago_david.cpp
@@ -59,47 +59,45 @@ DiagoDavid<T, Device>::DiagoDavid(const Real* precondition_in,
     /// - "basis" : number of occupied ks-orbitals(subscripts i,j) * number of unoccupied ks-orbitals(subscripts a,b), corresponding to "bands" of the ground state
 
     // the lowest N eigenvalues
-    base_device::memory::resize_memory_op<Real, base_device::DEVICE_CPU>()(
-                        this->cpu_ctx, this->eigenvalue, nbase_x, "DAV::eig");
-    base_device::memory::set_memory_op<Real, base_device::DEVICE_CPU>()(
-                        this->cpu_ctx, this->eigenvalue, 0, nbase_x);
+    base_device::memory::resize_memory_op<Real, base_device::DEVICE_CPU>()(this->eigenvalue, nbase_x, "DAV::eig");
+    base_device::memory::set_memory_op<Real, base_device::DEVICE_CPU>()(this->eigenvalue, 0, nbase_x);
 
     // basis(dim, nbase_x), leading dimension = dim
-    resmem_complex_op()(this->ctx, basis, nbase_x * dim, "DAV::basis");
-    setmem_complex_op()(this->ctx, basis, 0, nbase_x * dim);
+    resmem_complex_op()(basis, nbase_x * dim, "DAV::basis");
+    setmem_complex_op()(basis, 0, nbase_x * dim);
 
     //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
     // hpsi(nbase_x, dim); // the product of H and psi in the reduced basis set
-    resmem_complex_op()(this->ctx, this->hpsi, nbase_x * dim, "DAV::hpsi");
-    setmem_complex_op()(this->ctx, this->hpsi, 0, nbase_x * dim);
+    resmem_complex_op()(this->hpsi, nbase_x * dim, "DAV::hpsi");
+    setmem_complex_op()(this->hpsi, 0, nbase_x * dim);
 
     // spsi(nbase_x, dim); // the Product of S and psi in the reduced basis set
-    resmem_complex_op()(this->ctx, this->spsi, nbase_x * dim, "DAV::spsi");
-    setmem_complex_op()(this->ctx, this->spsi, 0, nbase_x * dim);
+    resmem_complex_op()(this->spsi, nbase_x * dim, "DAV::spsi");
+    setmem_complex_op()(this->spsi, 0, nbase_x * dim);
 
     // hcc(nbase_x, nbase_x); // Hamiltonian on the reduced basis
-    resmem_complex_op()(this->ctx, this->hcc, nbase_x * nbase_x, "DAV::hcc");
-    setmem_complex_op()(this->ctx, this->hcc, 0, nbase_x * nbase_x);
+    resmem_complex_op()(this->hcc, nbase_x * nbase_x, "DAV::hcc");
+    setmem_complex_op()(this->hcc, 0, nbase_x * nbase_x);
 
     // scc(nbase_x, nbase_x); // Overlap on the reduced basis
     // resmem_complex_op()(this->ctx, this->scc, nbase_x * nbase_x, "DAV::scc");
     // setmem_complex_op()(this->ctx, this->scc, 0, nbase_x * nbase_x);
 
     // vcc(nbase_x, nbase_x); // Eigenvectors of hcc
-    resmem_complex_op()(this->ctx, this->vcc, nbase_x * nbase_x, "DAV::vcc");
-    setmem_complex_op()(this->ctx, this->vcc, 0, nbase_x * nbase_x);
+    resmem_complex_op()(this->vcc, nbase_x * nbase_x, "DAV::vcc");
+    setmem_complex_op()(this->vcc, 0, nbase_x * nbase_x);
     //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
     
     // lagrange_matrix(nband, nband); // for orthogonalization
-    resmem_complex_op()(this->ctx, this->lagrange_matrix, nband * nband);
-    setmem_complex_op()(this->ctx, this->lagrange_matrix, 0, nband * nband);
+    resmem_complex_op()(this->lagrange_matrix, nband * nband);
+    setmem_complex_op()(this->lagrange_matrix, 0, nband * nband);
 
 #if defined(__CUDA) || defined(__ROCM)
     // device precondition array
     if (this->device == base_device::GpuDevice)
     {
-        resmem_var_op()(this->ctx, this->d_precondition, dim);
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition, dim);
+        resmem_var_op()(this->d_precondition, dim);
+        syncmem_var_h2d_op()(this->d_precondition, this->precondition, dim);
     }
 #endif
 }
@@ -107,19 +105,19 @@ DiagoDavid<T, Device>::DiagoDavid(const Real* precondition_in,
 template <typename T, typename Device>
 DiagoDavid<T, Device>::~DiagoDavid()
 {
-    delmem_complex_op()(this->ctx, this->basis);
-    delmem_complex_op()(this->ctx, this->hpsi);
-    delmem_complex_op()(this->ctx, this->spsi);
-    delmem_complex_op()(this->ctx, this->hcc);
+    delmem_complex_op()(this->basis);
+    delmem_complex_op()(this->hpsi);
+    delmem_complex_op()(this->spsi);
+    delmem_complex_op()(this->hcc);
     // delmem_complex_op()(this->ctx, this->scc);
-    delmem_complex_op()(this->ctx, this->vcc);
-    delmem_complex_op()(this->ctx, this->lagrange_matrix);
-    base_device::memory::delete_memory_op<Real, base_device::DEVICE_CPU>()(this->cpu_ctx, this->eigenvalue);
+    delmem_complex_op()(this->vcc);
+    delmem_complex_op()(this->lagrange_matrix);
+    base_device::memory::delete_memory_op<Real, base_device::DEVICE_CPU>()(this->eigenvalue);
     // If the device is a GPU device, free the d_precondition array.
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == base_device::GpuDevice)
     {
-        delmem_var_op()(this->ctx, this->d_precondition);
+        delmem_var_op()(this->d_precondition);
     }
 #endif
 }
@@ -181,7 +179,7 @@ int DiagoDavid<T, Device>::diag_once(const HPsiFunc& hpsi_func,
     // begin SchmidtOrth
     for (int m = 0; m < nband; m++)
     {
-        syncmem_complex_op()(this->ctx, this->ctx, basis + dim*m, psi_in + m*ld_psi, dim);
+        syncmem_complex_op()(basis + dim*m, psi_in + m*ld_psi, dim);
 
         this->SchmidtOrth(dim,
                          nband,
@@ -266,7 +264,7 @@ int DiagoDavid<T, Device>::diag_once(const HPsiFunc& hpsi_func,
 
             // update eigenvectors of Hamiltonian
 
-            setmem_complex_op()(this->ctx, psi_in, 0, nband * ld_psi);
+            setmem_complex_op()(psi_in, 0, nband * ld_psi);
             //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
             gemm_op<T, Device>()(this->ctx,
                                       'N',
@@ -353,8 +351,8 @@ void DiagoDavid<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
     // vc_ev_vector(notconv, nbase);
     // eigenvectors of unconverged index extracted from vcc
     T* vc_ev_vector = nullptr;
-    resmem_complex_op()(this->ctx, vc_ev_vector, notconv * nbase);
-    setmem_complex_op()(this->ctx, vc_ev_vector, 0, notconv * nbase);
+    resmem_complex_op()(vc_ev_vector, notconv * nbase);
+    setmem_complex_op()(vc_ev_vector, 0, notconv * nbase);
 
     //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
     // for (int m = 0; m < notconv; m++)
@@ -372,9 +370,7 @@ void DiagoDavid<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
     //         vc_ev_vector[m * nbase + i] = vcc[i * nbase_x + unconv[m]];
     for (int m = 0; m < notconv; m++)
     {
-        syncmem_complex_op()(this->ctx,
-                             this->ctx,
-                             vc_ev_vector + m * nbase,
+        syncmem_complex_op()(vc_ev_vector + m * nbase,
                              vcc + unconv[m] * nbase_x,
                              nbase);
     }
@@ -419,14 +415,14 @@ void DiagoDavid<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
         {
 #if defined(__CUDA) || defined(__ROCM)
             Real* e_temp_gpu = nullptr;
-            resmem_var_op()(this->ctx, e_temp_gpu, nbase);
-            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_gpu, e_temp_cpu.data(), nbase);
+            resmem_var_op()(e_temp_gpu, nbase);
+            syncmem_var_h2d_op()(e_temp_gpu, e_temp_cpu.data(), nbase);
             vector_mul_vector_op<T, Device>()(this->ctx,
                                                    nbase,
                                                    vc_ev_vector + m * nbase,
                                                    vc_ev_vector + m * nbase,
                                                    e_temp_gpu);
-            delmem_var_op()(this->ctx, e_temp_gpu);
+            delmem_var_op()(e_temp_gpu);
 #endif
         }
         else
@@ -499,8 +495,8 @@ void DiagoDavid<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
     // there is a nbase to nbase + notconv band orthogonalise
     // plan for SchmidtOrth
     T* lagrange = nullptr;
-    resmem_complex_op()(this->ctx, lagrange, notconv * (nbase + notconv));
-    setmem_complex_op()(this->ctx, lagrange, 0, notconv * (nbase + notconv));
+    resmem_complex_op()(lagrange, notconv * (nbase + notconv));
+    setmem_complex_op()(lagrange, 0, notconv * (nbase + notconv));
 
     std::vector<int> pre_matrix_mm_m(notconv, 0);
     std::vector<int> pre_matrix_mv_m(notconv, 1);
@@ -569,8 +565,8 @@ void DiagoDavid<T, Device>::cal_grad(const HPsiFunc& hpsi_func,
     // hpsi[:, nbase:nbase+notcnv] = H basis[:, nbase:nbase+notcnv]
     hpsi_func(basis + nbase * dim, hpsi + nbase * dim, dim, notconv);
 
-    delmem_complex_op()(this->ctx, lagrange);
-    delmem_complex_op()(this->ctx, vc_ev_vector);
+    delmem_complex_op()(lagrange);
+    delmem_complex_op()(vc_ev_vector);
 
     ModuleBase::timer::tick("DiagoDavid", "cal_grad");
     return;
@@ -635,7 +631,7 @@ void DiagoDavid<T, Device>::cal_elem(const int& dim,
         // matrixTranspose_op<T, Device>()(this->ctx, nbase_x, nbase_x, scc, scc);
 
         auto* swap = new T[notconv * nbase_x];
-        syncmem_complex_op()(this->ctx, this->ctx, swap, hcc + nbase * nbase_x, notconv * nbase_x);
+        syncmem_complex_op()(swap, hcc + nbase * nbase_x, notconv * nbase_x);
         if (std::is_same<T, double>::value)
         {
             Parallel_Reduce::reduce_pool(hcc + nbase * nbase_x, notconv * nbase_x);
@@ -700,13 +696,13 @@ void DiagoDavid<T, Device>::diag_zhegvx(const int& nbase,
         {
 #if defined(__CUDA) || defined(__ROCM)
             Real* eigenvalue_gpu = nullptr;
-            resmem_var_op()(this->ctx, eigenvalue_gpu, nbase_x);
-            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalue_gpu, this->eigenvalue, nbase_x);
+            resmem_var_op()(eigenvalue_gpu, nbase_x);
+            syncmem_var_h2d_op()(eigenvalue_gpu, this->eigenvalue, nbase_x);
 
             dnevx_op<T, Device>()(this->ctx, nbase, nbase_x, hcc, nband, eigenvalue_gpu, vcc);
 
-            syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, this->eigenvalue, eigenvalue_gpu, nbase_x);
-            delmem_var_op()(this->ctx, eigenvalue_gpu);
+            syncmem_var_d2h_op()(this->eigenvalue, eigenvalue_gpu, nbase_x);
+            delmem_var_op()(eigenvalue_gpu);
 #endif
         }
         else
@@ -752,7 +748,7 @@ void DiagoDavid<T, Device>::refresh(const int& dim,
     ModuleBase::timer::tick("DiagoDavid", "refresh");
 
     // update hp,sp
-    setmem_complex_op()(this->ctx, basis , 0, nbase_x * dim);
+    setmem_complex_op()(basis , 0, nbase_x * dim);
 
     // basis(dim, nband) = hpsi(dim, nbase) * vcc(nbase, nband)
     gemm_op<T, Device>()(this->ctx,
@@ -790,8 +786,8 @@ void DiagoDavid<T, Device>::refresh(const int& dim,
     );
 
     // hpsi = basis, spsi = basis[nband]
-    syncmem_complex_op()(this->ctx, this->ctx, hpsi, basis, dim * nband);
-    syncmem_complex_op()(this->ctx, this->ctx, spsi, basis + dim*nband, dim * nband);
+    syncmem_complex_op()(hpsi, basis, dim * nband);
+    syncmem_complex_op()(spsi, basis + dim*nband, dim * nband);
     /*for (int m = 0; m < nband; m++) {
         for (int ig = 0; ig < dim; ig++)
         {
@@ -801,11 +797,11 @@ void DiagoDavid<T, Device>::refresh(const int& dim,
     }*/
 
     // update basis
-    setmem_complex_op()(this->ctx, basis , 0, nbase_x * dim);
+    setmem_complex_op()(basis , 0, nbase_x * dim);
 
     for (int m = 0; m < nband; m++)
     {
-        syncmem_complex_op()(this->ctx, this->ctx, basis + dim*m,psi_in + m*ld_psi, dim);
+        syncmem_complex_op()(basis + dim*m,psi_in + m*ld_psi, dim);
         /*for (int ig = 0; ig < npw; ig++)
             basis(m, ig) = psi(m, ig);*/
     }
@@ -814,7 +810,7 @@ void DiagoDavid<T, Device>::refresh(const int& dim,
     // basis set size reset to nband
     nbase = nband;
 
-    setmem_complex_op()(this->ctx, hcc, 0, nbase_x * nbase_x);
+    setmem_complex_op()(hcc, 0, nbase_x * nbase_x);
 
     // setmem_complex_op()(this->ctx, scc, 0, nbase_x * nbase_x);
 
@@ -824,22 +820,20 @@ void DiagoDavid<T, Device>::refresh(const int& dim,
         T* hcc_cpu = nullptr;
         // T* scc_cpu = nullptr;
         T* vcc_cpu = nullptr;
-        base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(this->cpu_ctx,
-                                                                            hcc_cpu,
+        base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(hcc_cpu,
                                                                             nbase_x * nbase_x,
                                                                             "DAV::hcc");
         // base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(this->cpu_ctx,
         //                                                                     scc_cpu,
         //                                                                     nbase_x * nbase_x,
         //                                                                     "DAV::scc");
-        base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(this->cpu_ctx,
-                                                                            vcc_cpu,
+        base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(vcc_cpu,
                                                                             nbase_x * nbase_x,
                                                                             "DAV::vcc");
 
-        syncmem_d2h_op()(this->cpu_ctx, this->ctx, hcc_cpu, hcc, nbase_x * nbase_x);
+        syncmem_d2h_op()(hcc_cpu, hcc, nbase_x * nbase_x);
         // syncmem_d2h_op()(this->cpu_ctx, this->ctx, scc_cpu, scc, nbase_x * nbase_x);
-        syncmem_d2h_op()(this->cpu_ctx, this->ctx, vcc_cpu, vcc, nbase_x * nbase_x);
+        syncmem_d2h_op()(vcc_cpu, vcc, nbase_x * nbase_x);
 
         for (int i = 0; i < nbase; i++)
         {
@@ -848,13 +842,13 @@ void DiagoDavid<T, Device>::refresh(const int& dim,
             vcc_cpu[i * nbase_x + i] = this->one[0];
         }
 
-        syncmem_h2d_op()(this->ctx, this->cpu_ctx, hcc, hcc_cpu, nbase_x * nbase_x);
+        syncmem_h2d_op()(hcc, hcc_cpu, nbase_x * nbase_x);
         // syncmem_h2d_op()(this->ctx, this->cpu_ctx, scc, scc_cpu, nbase_x * nbase_x);
-        syncmem_h2d_op()(this->ctx, this->cpu_ctx, vcc, vcc_cpu, nbase_x * nbase_x);
+        syncmem_h2d_op()(vcc, vcc_cpu, nbase_x * nbase_x);
 
-        base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(this->cpu_ctx, hcc_cpu);
+        base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(hcc_cpu);
         // base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(this->cpu_ctx, scc_cpu);
-        base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(this->cpu_ctx, vcc_cpu);
+        base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(vcc_cpu);
 #endif
     }
     else
@@ -941,7 +935,7 @@ void DiagoDavid<T, Device>::SchmidtOrth(const int& dim,
     Parallel_Reduce::reduce_pool(lagrange_m, m + 1);
 
     T var = *this->zero;
-    syncmem_d2h_op()(this->cpu_ctx, this->ctx, &var, lagrange_m + m, 1);
+    syncmem_d2h_op()(&var, lagrange_m + m, 1);
     double psi_norm = get_real(var);
 
     assert(psi_norm > 0.0);
diff --git a/source/module_hsolver/diago_iter_assist.cpp b/source/module_hsolver/diago_iter_assist.cpp
index c05ecdf8ec..5a3acf8e53 100644
--- a/source/module_hsolver/diago_iter_assist.cpp
+++ b/source/module_hsolver/diago_iter_assist.cpp
@@ -42,12 +42,12 @@ void DiagoIterAssist<T, Device>::diagH_subspace(const hamilt::Hamilt<T, Device>*
     assert(n_band <= nstart);
 
     T *hcc = nullptr, *scc = nullptr, *vcc = nullptr;
-    resmem_complex_op()(ctx, hcc, nstart * nstart, "DiagSub::hcc");
-    resmem_complex_op()(ctx, scc, nstart * nstart, "DiagSub::scc");
-    resmem_complex_op()(ctx, vcc, nstart * nstart, "DiagSub::vcc");
-    setmem_complex_op()(ctx, hcc, 0, nstart * nstart);
-    setmem_complex_op()(ctx, scc, 0, nstart * nstart);
-    setmem_complex_op()(ctx, vcc, 0, nstart * nstart);
+    resmem_complex_op()(hcc, nstart * nstart, "DiagSub::hcc");
+    resmem_complex_op()(scc, nstart * nstart, "DiagSub::scc");
+    resmem_complex_op()(vcc, nstart * nstart, "DiagSub::vcc");
+    setmem_complex_op()(hcc, 0, nstart * nstart);
+    setmem_complex_op()(scc, 0, nstart * nstart);
+    setmem_complex_op()(vcc, 0, nstart * nstart);
 
     const int dmin = psi.get_current_ngk();
     const int dmax = psi.get_nbasis();
@@ -61,11 +61,11 @@ void DiagoIterAssist<T, Device>::diagH_subspace(const hamilt::Hamilt<T, Device>*
     }
     else
     {
-        resmem_complex_op()(ctx, temp, nstart * dmax, "DiagSub::temp");
+        resmem_complex_op()(temp, nstart * dmax, "DiagSub::temp");
     }
 
     { // code block to calculate hcc and scc
-        setmem_complex_op()(ctx, temp, 0, nstart * dmax);
+        setmem_complex_op()(temp, 0, nstart * dmax);
 
         T* hphi = temp;
         // do hPsi for all bands
@@ -140,11 +140,11 @@ void DiagoIterAssist<T, Device>::diagH_subspace(const hamilt::Hamilt<T, Device>*
     if (!in_place)
     {
         matrixSetToAnother<T, Device>()(ctx, n_band, temp, ld_temp, evc.get_pointer(), dmax);
-        delmem_complex_op()(ctx, temp);
+        delmem_complex_op()(temp);
     }
-    delmem_complex_op()(ctx, hcc);
-    delmem_complex_op()(ctx, scc);
-    delmem_complex_op()(ctx, vcc);
+    delmem_complex_op()(hcc);
+    delmem_complex_op()(scc);
+    delmem_complex_op()(vcc);
 
     ModuleBase::timer::tick("DiagoIterAssist", "diagH_subspace");
 }
@@ -192,12 +192,12 @@ void DiagoIterAssist<T, Device>::diagH_subspace_init(hamilt::Hamilt<T, Device>*
     // ModuleBase::ComplexMatrix sc(nstart, nstart);
     // ModuleBase::ComplexMatrix hvec(nstart, n_band);
     T *hcc = nullptr, *scc = nullptr, *vcc = nullptr;
-    resmem_complex_op()(ctx, hcc, nstart * nstart, "DiagSub::hcc");
-    resmem_complex_op()(ctx, scc, nstart * nstart, "DiagSub::scc");
-    resmem_complex_op()(ctx, vcc, nstart * nstart, "DiagSub::vcc");
-    setmem_complex_op()(ctx, hcc, 0, nstart * nstart);
-    setmem_complex_op()(ctx, scc, 0, nstart * nstart);
-    setmem_complex_op()(ctx, vcc, 0, nstart * nstart);
+    resmem_complex_op()(hcc, nstart * nstart, "DiagSub::hcc");
+    resmem_complex_op()(scc, nstart * nstart, "DiagSub::scc");
+    resmem_complex_op()(vcc, nstart * nstart, "DiagSub::vcc");
+    setmem_complex_op()(hcc, 0, nstart * nstart);
+    setmem_complex_op()(scc, 0, nstart * nstart);
+    setmem_complex_op()(vcc, 0, nstart * nstart);
 
     if (base_device::get_device_type(ctx) == base_device::GpuDevice)
     {
@@ -206,15 +206,15 @@ void DiagoIterAssist<T, Device>::diagH_subspace_init(hamilt::Hamilt<T, Device>*
         T* ppsi = psi_temp.get_pointer();
         // hpsi and spsi share the temp space
         T* temp = nullptr;
-        resmem_complex_op()(ctx, temp, psi_nc, "DiagSub::temp");
-        setmem_complex_op()(ctx, temp, 0, psi_nc);
+        resmem_complex_op()(temp, psi_nc, "DiagSub::temp");
+        setmem_complex_op()(temp, 0, psi_nc);
 
         T* hpsi = temp;
         // do hPsi band by band
         for (int i = 0; i < nstart; i++)
         {
             // psi_temp is one band psi, psi is all bands psi, the range always is 1 for the only band in psi_temp
-            syncmem_complex_op()(ctx, ctx, ppsi, psi + i * psi_nc, psi_nc);
+            syncmem_complex_op()(ppsi, psi + i * psi_nc, psi_nc);
             psi::Range band_by_band_range(true, 0, 0, 0);
             hpsi_info hpsi_in(&psi_temp, band_by_band_range, hpsi);
 
@@ -229,7 +229,7 @@ void DiagoIterAssist<T, Device>::diagH_subspace_init(hamilt::Hamilt<T, Device>*
         // do sPsi band by band
         for (int i = 0; i < nstart; i++)
         {
-            syncmem_complex_op()(ctx, ctx, ppsi, psi + i * psi_nc, psi_nc);
+            syncmem_complex_op()(ppsi, psi + i * psi_nc, psi_nc);
             pHamilt->sPsi(ppsi, spsi, dmin, dmin, 1);
 
             gemv_op<T, Device>()(ctx,
@@ -245,18 +245,18 @@ void DiagoIterAssist<T, Device>::diagH_subspace_init(hamilt::Hamilt<T, Device>*
                                  scc + i * nstart,
                                  1);
         }
-        delmem_complex_op()(ctx, temp);
+        delmem_complex_op()(temp);
     }
     else if (base_device::get_device_type(ctx) == base_device::CpuDevice)
     {
         psi::Psi<T, Device> psi_temp(1, nstart, psi_nc, dmin, true);
 
         T* ppsi = psi_temp.get_pointer();
-        syncmem_complex_op()(ctx, ctx, ppsi, psi, psi_temp.size());
+        syncmem_complex_op()(ppsi, psi, psi_temp.size());
         // hpsi and spsi share the temp space
         T* temp = nullptr;
-        resmem_complex_op()(ctx, temp, nstart * psi_nc, "DiagSub::temp");
-        setmem_complex_op()(ctx, temp, 0, nstart * psi_nc);
+        resmem_complex_op()(temp, nstart * psi_nc, "DiagSub::temp");
+        setmem_complex_op()(temp, 0, nstart * psi_nc);
 
         T* hpsi = temp;
         // do hPsi for all bands
@@ -271,7 +271,7 @@ void DiagoIterAssist<T, Device>::diagH_subspace_init(hamilt::Hamilt<T, Device>*
         pHamilt->sPsi(ppsi, spsi, psi_temp.get_nbasis(), psi_temp.get_nbasis(), psi_temp.get_nbands());
 
         gemm_op<T, Device>()(ctx, 'C', 'N', nstart, nstart, dmin, &one, ppsi, dmax, spsi, dmax, &zero, scc, nstart);
-        delmem_complex_op()(ctx, temp);
+        delmem_complex_op()(temp);
 
         add_to_hcc(hcc, nstart);
 
@@ -358,9 +358,9 @@ void DiagoIterAssist<T, Device>::diagH_subspace_init(hamilt::Hamilt<T, Device>*
         // delmem_complex_op()(ctx, evctemp);
     }
 
-    delmem_complex_op()(ctx, hcc);
-    delmem_complex_op()(ctx, scc);
-    delmem_complex_op()(ctx, vcc);
+    delmem_complex_op()(hcc);
+    delmem_complex_op()(scc);
+    delmem_complex_op()(vcc);
     ModuleBase::timer::tick("DiagoIterAssist", "diagH_subspace_init");
 }
 
@@ -377,8 +377,8 @@ void DiagoIterAssist<T, Device>::diagH_LAPACK(const int nstart,
     ModuleBase::timer::tick("DiagoIterAssist", "diagH_LAPACK");
 
     Real* eigenvalues = nullptr;
-    resmem_var_op()(ctx, eigenvalues, nstart);
-    setmem_var_op()(ctx, eigenvalues, 0, nstart);
+    resmem_var_op()(eigenvalues, nstart);
+    setmem_var_op()(eigenvalues, 0, nstart);
 
     dngvd_op<T, Device>()(ctx, nstart, ldh, hcc, scc, eigenvalues, vcc);
 
@@ -386,16 +386,16 @@ void DiagoIterAssist<T, Device>::diagH_LAPACK(const int nstart,
     {
 #if ((defined __CUDA) || (defined __ROCM))
         // set eigenvalues in GPU to e in CPU
-        syncmem_var_d2h_op()(cpu_ctx, gpu_ctx, e, eigenvalues, nbands);
+        syncmem_var_d2h_op()(e, eigenvalues, nbands);
 #endif
     }
     else if (base_device::get_device_type<Device>(ctx) == base_device::CpuDevice)
     {
         // set eigenvalues in CPU to e in CPU
-        syncmem_var_op()(ctx, ctx, e, eigenvalues, nbands);
+        syncmem_var_op()(e, eigenvalues, nbands);
     }
 
-    delmem_var_op()(ctx, eigenvalues);
+    delmem_var_op()(eigenvalues);
 
     // const bool all_eigenvalues = (nstart == nbands);
     // if (all_eigenvalues) {
@@ -423,18 +423,18 @@ void DiagoIterAssist<T, Device>::cal_hs_subspace(const hamilt::Hamilt<T, Device>
 {
     const int nstart = psi.get_nbands();
     
-    setmem_complex_op()(ctx, hcc, 0, nstart * nstart);
-    setmem_complex_op()(ctx, scc, 0, nstart * nstart);
+    setmem_complex_op()(hcc, 0, nstart * nstart);
+    setmem_complex_op()(scc, 0, nstart * nstart);
 
     const int dmin = psi.get_current_ngk();
     const int dmax = psi.get_nbasis();
 
     T* temp = nullptr;
-    resmem_complex_op()(ctx, temp, nstart * dmax, "DiagSub::temp");
-    setmem_complex_op()(ctx, temp, 0, nstart * dmax);
+    resmem_complex_op()(temp, nstart * dmax, "DiagSub::temp");
+    setmem_complex_op()(temp, 0, nstart * dmax);
 
     { // code block to calculate hcc and scc
-        setmem_complex_op()(ctx, temp, 0, nstart * dmax);
+        setmem_complex_op()(temp, 0, nstart * dmax);
 
         T* hphi = temp;
         // do hPsi for all bands
@@ -483,7 +483,7 @@ void DiagoIterAssist<T, Device>::cal_hs_subspace(const hamilt::Hamilt<T, Device>
         Parallel_Reduce::reduce_pool(scc, nstart * nstart);
     }
 
-    delmem_complex_op()(ctx, temp);
+    delmem_complex_op()(temp);
 }
 
 template <typename T, typename Device>
@@ -502,8 +502,8 @@ void DiagoIterAssist<T, Device>::diag_responce( const T* hcc,
     const int nstart = nbands;
 
     T *vcc = nullptr;
-    resmem_complex_op()(ctx, vcc, nstart * nstart, "DiagSub::vcc");
-    setmem_complex_op()(ctx, vcc, 0, nstart * nstart);
+    resmem_complex_op()(vcc, nstart * nstart, "DiagSub::vcc");
+    setmem_complex_op()(vcc, 0, nstart * nstart);
 
     // after generation of H and S matrix, diag them
     DiagoIterAssist::diagH_LAPACK(nstart, nstart, hcc, scc, nstart, en, vcc);
@@ -525,7 +525,7 @@ void DiagoIterAssist<T, Device>::diag_responce( const T* hcc,
                              mat_col);
     }
 
-    delmem_complex_op()(ctx, vcc);
+    delmem_complex_op()(vcc);
 
     ModuleBase::timer::tick("DiagoIterAssist", "diag_responce");
 }
@@ -545,8 +545,8 @@ void DiagoIterAssist<T, Device>::diag_subspace_psi(const T* hcc,
     const int n_band = evc.get_nbands();
 
     T *vcc = nullptr;
-    resmem_complex_op()(ctx, vcc, nstart * nstart, "DiagSub::vcc");
-    setmem_complex_op()(ctx, vcc, 0, nstart * nstart);
+    resmem_complex_op()(vcc, nstart * nstart, "DiagSub::vcc");
+    setmem_complex_op()(vcc, 0, nstart * nstart);
 
     // after generation of H and S matrix, diag them
     DiagoIterAssist::diagH_LAPACK(nstart, nstart, hcc, scc, nstart, en, vcc);
@@ -555,8 +555,8 @@ void DiagoIterAssist<T, Device>::diag_subspace_psi(const T* hcc,
         const int dmin = evc.get_current_ngk();
         const int dmax = evc.get_nbasis();
         T* temp = nullptr;
-        resmem_complex_op()(ctx, temp, nstart * dmax, "DiagSub::temp");
-        setmem_complex_op()(ctx, temp, 0, nstart * dmax);
+        resmem_complex_op()(temp, nstart * dmax, "DiagSub::temp");
+        setmem_complex_op()(temp, 0, nstart * dmax);
         gemm_op<T, Device>()(ctx,
                              'N',
                              'N',
@@ -572,10 +572,10 @@ void DiagoIterAssist<T, Device>::diag_subspace_psi(const T* hcc,
                              temp,
                              dmin);
         matrixSetToAnother<T, Device>()(ctx, n_band, temp, dmin, evc.get_pointer(), dmax);
-        delmem_complex_op()(ctx, temp);
+        delmem_complex_op()(temp);
     }
 
-    delmem_complex_op()(ctx, vcc);
+    delmem_complex_op()(vcc);
 
     ModuleBase::timer::tick("DiagoIterAssist", "diag_subspace_psi");
 }
diff --git a/source/module_hsolver/hsolver_lcaopw.cpp b/source/module_hsolver/hsolver_lcaopw.cpp
index 059318034a..b6e95b4c03 100644
--- a/source/module_hsolver/hsolver_lcaopw.cpp
+++ b/source/module_hsolver/hsolver_lcaopw.cpp
@@ -270,8 +270,6 @@ void HSolverLIP<T>::solve(hamilt::Hamilt<T>* pHamilt, // ESolver_KS_PW::p_hamilt
         /// calculate the contribution of Psi for charge density rho
     }
     base_device::memory::cast_memory_op<double, Real, base_device::DEVICE_CPU, base_device::DEVICE_CPU>()(
-        cpu_ctx,
-        cpu_ctx,
         pes->ekb.c,
         eigenvalues.data(),
         pes->ekb.nr * pes->ekb.nc);
diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp
index de627d3474..05ccc8acd0 100644
--- a/source/module_hsolver/hsolver_pw.cpp
+++ b/source/module_hsolver/hsolver_pw.cpp
@@ -329,8 +329,6 @@ void HSolverPW<T, Device>::solve(hamilt::Hamilt<T, Device>* pHamilt,
 
     // copy eigenvalues to ekb in ElecState
     base_device::memory::cast_memory_op<double, Real, base_device::DEVICE_CPU, base_device::DEVICE_CPU>()(
-        cpu_ctx,
-        cpu_ctx,
         // pes->ekb.c,
         out_eigenvalues,
         eigenvalues.data(),
@@ -450,8 +448,6 @@ void HSolverPW<T, Device>::hamiltSolvePsiK(hamilt::Hamilt<T, Device>* hm,
             else
             {
                 base_device::memory::synchronize_memory_op<T, Device, Device>()(
-                    this->ctx,
-                    this->ctx,
                     spsi_out.data<T>(),
                     psi_in.data<T>(),
                     static_cast<size_t>((ndim == 1 ? 1 : psi_in.shape().dim_size(0))
diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
index 70ed5ebf0b..cd3ac41812 100644
--- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu
+++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu
@@ -887,7 +887,7 @@ void matrixTranspose_op<double, base_device::DEVICE_GPU>::operator()(const base_
                                                                      double* output_matrix)
 {
     double* device_temp = nullptr;
-    base_device::memory::resize_memory_op<double, base_device::DEVICE_GPU>()(d, device_temp, row * col);
+    base_device::memory::resize_memory_op<double, base_device::DEVICE_GPU>()(device_temp, row * col);
 
     if (row == col)
     {
@@ -906,13 +906,11 @@ void matrixTranspose_op<double, base_device::DEVICE_GPU>::operator()(const base_
     }
 
     base_device::memory::synchronize_memory_op<double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(
-        d,
-        d,
         output_matrix,
         device_temp,
         row * col);
 
-    base_device::memory::delete_memory_op<double, base_device::DEVICE_GPU>()(d, device_temp);
+    base_device::memory::delete_memory_op<double, base_device::DEVICE_GPU>()(device_temp);
 }
 
 template <>
@@ -924,7 +922,7 @@ void matrixTranspose_op<std::complex<float>, base_device::DEVICE_GPU>::operator(
     std::complex<float>* output_matrix)
 {
     std::complex<float>* device_temp = nullptr;
-    base_device::memory::resize_memory_op<std::complex<float>, base_device::DEVICE_GPU>()(d, device_temp, row * col);
+    base_device::memory::resize_memory_op<std::complex<float>, base_device::DEVICE_GPU>()(device_temp, row * col);
 
     if (row == col)
     {
@@ -947,13 +945,11 @@ void matrixTranspose_op<std::complex<float>, base_device::DEVICE_GPU>::operator(
     }
 
     base_device::memory::synchronize_memory_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(
-        d,
-        d,
         output_matrix,
         device_temp,
         row * col);
 
-    base_device::memory::delete_memory_op<std::complex<float>, base_device::DEVICE_GPU>()(d, device_temp);
+    base_device::memory::delete_memory_op<std::complex<float>, base_device::DEVICE_GPU>()(device_temp);
 
     cudaCheckOnDebug();
 
@@ -968,7 +964,7 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
     std::complex<double>* output_matrix)
 {
     std::complex<double>* device_temp = nullptr;
-    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(d, device_temp, row * col);
+    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(device_temp, row * col);
 
     if (row == col)
     {
@@ -989,9 +985,9 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
 
     base_device::memory::synchronize_memory_op<std::complex<double>,
                                                base_device::DEVICE_GPU,
-                                               base_device::DEVICE_GPU>()(d, d, output_matrix, device_temp, row * col);
+                                               base_device::DEVICE_GPU>()(output_matrix, device_temp, row * col);
 
-    base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(d, device_temp);
+    base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(device_temp);
 }
 
 template <>
diff --git a/source/module_hsolver/kernels/math_kernel_op.cpp b/source/module_hsolver/kernels/math_kernel_op.cpp
index 3a752c3659..db2a12e9db 100644
--- a/source/module_hsolver/kernels/math_kernel_op.cpp
+++ b/source/module_hsolver/kernels/math_kernel_op.cpp
@@ -323,7 +323,7 @@ struct matrixTranspose_op<T, base_device::DEVICE_CPU>
                     T* output_matrix)
     {
         T* temp = nullptr;
-        base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(d, temp, row * col, "MTransOp");
+        base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>()(temp, row * col, "MTransOp");
 #ifdef _OPENMP
 #pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T))
 #endif
@@ -341,7 +341,7 @@ struct matrixTranspose_op<T, base_device::DEVICE_CPU>
         {
             output_matrix[i] = temp[i];
         }
-        base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(d, temp);
+        base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>()(temp);
     }
 };
 
diff --git a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
index ef5a1c1ece..1993ae4c64 100644
--- a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
+++ b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu
@@ -806,7 +806,7 @@ void matrixTranspose_op<double, base_device::DEVICE_GPU>::operator()(const base_
                                                                      double* output_matrix)
 {
     double* device_temp = nullptr;
-    base_device::memory::resize_memory_op<double, base_device::DEVICE_GPU>()(d, device_temp, row * col);
+    base_device::memory::resize_memory_op<double, base_device::DEVICE_GPU>()(device_temp, row * col);
 
     if (row == col)
     {
@@ -823,13 +823,11 @@ void matrixTranspose_op<double, base_device::DEVICE_GPU>::operator()(const base_
     }
 
     base_device::memory::synchronize_memory_op<double, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(
-        d,
-        d,
         output_matrix,
         device_temp,
         row * col);
 
-    base_device::memory::delete_memory_op<double, base_device::DEVICE_GPU>()(d, device_temp);
+    base_device::memory::delete_memory_op<double, base_device::DEVICE_GPU>()(device_temp);
 }
 
 template <>
@@ -841,7 +839,7 @@ void matrixTranspose_op<std::complex<float>, base_device::DEVICE_GPU>::operator(
     std::complex<float>* output_matrix)
 {
     std::complex<float>* device_temp = nullptr;
-    base_device::memory::resize_memory_op<std::complex<float>, base_device::DEVICE_GPU>()(d, device_temp, row * col);
+    base_device::memory::resize_memory_op<std::complex<float>, base_device::DEVICE_GPU>()(device_temp, row * col);
 
     if (row == col)
     {
@@ -863,13 +861,11 @@ void matrixTranspose_op<std::complex<float>, base_device::DEVICE_GPU>::operator(
     }
 
     base_device::memory::synchronize_memory_op<std::complex<float>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(
-        d,
-        d,
         output_matrix,
         device_temp,
         row * col);
 
-    base_device::memory::delete_memory_op<std::complex<float>, base_device::DEVICE_GPU>()(d, device_temp);
+    base_device::memory::delete_memory_op<std::complex<float>, base_device::DEVICE_GPU>()(device_temp);
 }
 
 template <>
@@ -881,7 +877,7 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
     std::complex<double>* output_matrix)
 {
     std::complex<double>* device_temp = nullptr;
-    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(d, device_temp, row * col);
+    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(device_temp, row * col);
 
     if (row == col)
     {
@@ -898,9 +894,9 @@ void matrixTranspose_op<std::complex<double>, base_device::DEVICE_GPU>::operator
 
     base_device::memory::synchronize_memory_op<std::complex<double>,
                                                base_device::DEVICE_GPU,
-                                               base_device::DEVICE_GPU>()(d, d, output_matrix, device_temp, row * col);
+                                               base_device::DEVICE_GPU>()(output_matrix, device_temp, row * col);
 
-    base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(d, device_temp);
+    base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(device_temp);
 }
 
 template <>
diff --git a/source/module_hsolver/kernels/test/math_dngvd_test.cpp b/source/module_hsolver/kernels/test/math_dngvd_test.cpp
index 8b614ae9a0..a67b18d4be 100644
--- a/source/module_hsolver/kernels/test/math_dngvd_test.cpp
+++ b/source/module_hsolver/kernels/test/math_dngvd_test.cpp
@@ -140,8 +140,8 @@ TEST_F(TestModuleHsolverMathDngvd, transpose_gpu)
         // {-0.351417,-1.73472}, {-8.32667,2.3744}, {4.16334,3.64292}
     };
     std::complex<double>* device_transpose = nullptr;
-    resize_memory_op_Z()(gpu_ctx, device_transpose, matrix_size);
-    synchronize_memory_op_C2G_Z()(gpu_ctx, cpu_ctx, device_transpose, transpose.data(), transpose.size());
+    resize_memory_op_Z()(device_transpose, matrix_size);
+    synchronize_memory_op_C2G_Z()(device_transpose, transpose.data(), transpose.size());
 
     // run
     hsolver::createGpuBlasHandle();
@@ -162,7 +162,7 @@ TEST_F(TestModuleHsolverMathDngvd, transpose_gpu)
         {0.0, 0.0},
         // {0.0,  0.0}, {0.0,  0.0}, {0.0,  0.0}
     };
-    synchronize_memory_op_G2C_Z()(cpu_ctx, gpu_ctx, transpose_result.data(), device_transpose, transpose.size());
+    synchronize_memory_op_G2C_Z()(transpose_result.data(), device_transpose, transpose.size());
 
     // std::vector<std::complex<double> > test_result = {
     //     {-0.351417,-1.73472}, {-0.351417,-1.73472}, {-0.351417,-1.73472},
diff --git a/source/module_hsolver/kernels/test/math_kernel_test.cpp b/source/module_hsolver/kernels/test/math_kernel_test.cpp
index c2c66fb936..0781d54787 100644
--- a/source/module_hsolver/kernels/test/math_kernel_test.cpp
+++ b/source/module_hsolver/kernels/test/math_kernel_test.cpp
@@ -371,16 +371,16 @@ TEST_F(TestModuleHsolverMathKernel, gemv_op_cpu)
 TEST_F(TestModuleHsolverMathKernel, zdot_real_op_gpu)
 {
     std::complex<double>*psi_L_dev = NULL, *psi_R_dev = NULL;
-    resize_memory_op()(gpu_ctx, psi_L_dev, psi_L.size());
-    resize_memory_op()(gpu_ctx, psi_R_dev, psi_R.size());
-    synchronize_memory_op()(gpu_ctx, cpu_ctx, psi_L_dev, psi_L.data(), psi_L.size());
-    synchronize_memory_op()(gpu_ctx, cpu_ctx, psi_R_dev, psi_R.data(), psi_R.size());
+    resize_memory_op()(psi_L_dev, psi_L.size());
+    resize_memory_op()(psi_R_dev, psi_R.size());
+    synchronize_memory_op()(psi_L_dev, psi_L.data(), psi_L.size());
+    synchronize_memory_op()(psi_R_dev, psi_R.data(), psi_R.size());
     hsolver::createGpuBlasHandle();
     double result = zdot_real_gpu_op()(gpu_ctx, dim, psi_L_dev, psi_R_dev, false);
     hsolver::destoryBLAShandle();
     EXPECT_LT(fabs(result - expected_result), 1e-12);
-    delete_memory_op()(gpu_ctx, psi_L_dev);
-    delete_memory_op()(gpu_ctx, psi_R_dev);
+    delete_memory_op()(psi_L_dev);
+    delete_memory_op()(psi_R_dev);
 }
 
 TEST_F(TestModuleHsolverMathKernel, vector_div_constant_op_gpu)
@@ -390,22 +390,22 @@ TEST_F(TestModuleHsolverMathKernel, vector_div_constant_op_gpu)
     // in GPU
     std::complex<double>* input_dev = NULL;
     std::complex<double>* output_dev = NULL;
-    resize_memory_op()(gpu_ctx, input_dev, input.size());
-    resize_memory_op()(gpu_ctx, output_dev, input.size());
+    resize_memory_op()(input_dev, input.size());
+    resize_memory_op()(output_dev, input.size());
     // syn the input data in CPU to GPU
-    synchronize_memory_op()(gpu_ctx, cpu_ctx, input_dev, input.data(), input.size());
+    synchronize_memory_op()(input_dev, input.data(), input.size());
     // run
     vector_div_constant_op_gpu()(gpu_ctx, dim, output_dev, input_dev, constant);
     // syn the output data in GPU to CPU
-    synchronize_memory_op_gpu()(cpu_ctx, gpu_ctx, output.data(), output_dev, output.size());
+    synchronize_memory_op_gpu()(output.data(), output_dev, output.size());
 
     for (int i = 0; i < input.size(); i++)
     {
         EXPECT_LT(fabs(output[i].imag() - output_vector_div_constant_op[i].imag()), 1e-8);
         EXPECT_LT(fabs(output[i].real() - output_vector_div_constant_op[i].real()), 1e-8);
     }
-    delete_memory_op()(gpu_ctx, input_dev);
-    delete_memory_op()(gpu_ctx, output_dev);
+    delete_memory_op()(input_dev);
+    delete_memory_op()(output_dev);
 }
 
 TEST_F(TestModuleHsolverMathKernel, vector_mul_vector_op_gpu)
@@ -419,19 +419,19 @@ TEST_F(TestModuleHsolverMathKernel, vector_mul_vector_op_gpu)
     std::complex<double>* output_dev = NULL;
 
     // resize memory for values
-    resize_memory_op()(gpu_ctx, input_dev, input.size());
-    resize_memory_op_double()(gpu_ctx, input_double_dev, input.size());
-    resize_memory_op()(gpu_ctx, output_dev, input.size());
+    resize_memory_op()(input_dev, input.size());
+    resize_memory_op_double()(input_double_dev, input.size());
+    resize_memory_op()(output_dev, input.size());
 
     // syn the input data in CPU to GPU
-    synchronize_memory_op()(gpu_ctx, cpu_ctx, input_dev, input.data(), input.size());
-    synchronize_memory_op_double()(gpu_ctx, cpu_ctx, input_double_dev, input_double.data(), input.size());
+    synchronize_memory_op()(input_dev, input.data(), input.size());
+    synchronize_memory_op_double()(input_double_dev, input_double.data(), input.size());
 
     // run
     vector_mul_vector_op_gpu()(gpu_ctx, dim, output_dev, input_dev, input_double_dev);
 
     // syn the output data in GPU to CPU
-    synchronize_memory_op_gpu()(cpu_ctx, gpu_ctx, output.data(), output_dev, output.size());
+    synchronize_memory_op_gpu()(output.data(), output_dev, output.size());
 
     for (int i = 0; i < input.size(); i++)
     {
@@ -439,9 +439,9 @@ TEST_F(TestModuleHsolverMathKernel, vector_mul_vector_op_gpu)
         EXPECT_LT(fabs(output[i].real() - output_vector_mul_vector_op[i].real()), 1e-8);
     }
 
-    delete_memory_op()(gpu_ctx, input_dev);
-    delete_memory_op_double()(gpu_ctx, input_double_dev);
-    delete_memory_op()(gpu_ctx, output_dev);
+    delete_memory_op()(input_dev);
+    delete_memory_op_double()(input_double_dev);
+    delete_memory_op()(output_dev);
 }
 
 TEST_F(TestModuleHsolverMathKernel, vector_div_vector_op_gpu)
@@ -455,19 +455,19 @@ TEST_F(TestModuleHsolverMathKernel, vector_div_vector_op_gpu)
     std::complex<double>* output_dev = NULL;
 
     // resize memory for values in GPU
-    resize_memory_op()(gpu_ctx, input_dev, input.size());
-    resize_memory_op_double()(gpu_ctx, input_double_dev, input.size());
-    resize_memory_op()(gpu_ctx, output_dev, input.size());
+    resize_memory_op()(input_dev, input.size());
+    resize_memory_op_double()(input_double_dev, input.size());
+    resize_memory_op()(output_dev, input.size());
 
     // syn the input data in CPU to GPU
-    synchronize_memory_op()(gpu_ctx, cpu_ctx, input_dev, input.data(), input.size());
-    synchronize_memory_op_double()(gpu_ctx, cpu_ctx, input_double_dev, input_double.data(), input.size());
+    synchronize_memory_op()(input_dev, input.data(), input.size());
+    synchronize_memory_op_double()(input_double_dev, input_double.data(), input.size());
 
     // run
     vector_div_vector_op_gpu()(gpu_ctx, dim, output_dev, input_dev, input_double_dev);
 
     // syn the output data in GPU to CPU
-    synchronize_memory_op_gpu()(cpu_ctx, gpu_ctx, output.data(), output_dev, output.size());
+    synchronize_memory_op_gpu()(output.data(), output_dev, output.size());
 
     for (int i = 0; i < input.size(); i++)
     {
@@ -475,9 +475,9 @@ TEST_F(TestModuleHsolverMathKernel, vector_div_vector_op_gpu)
         EXPECT_LT(fabs(output[i].real() - output_vector_div_vector_op[i].real()), 1e-8);
     }
 
-    delete_memory_op()(gpu_ctx, input_dev);
-    delete_memory_op_double()(gpu_ctx, input_double_dev);
-    delete_memory_op()(gpu_ctx, output_dev);
+    delete_memory_op()(input_dev);
+    delete_memory_op_double()(input_double_dev);
+    delete_memory_op()(output_dev);
 }
 
 TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_gpu)
@@ -491,13 +491,13 @@ TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_gp
     std::complex<double>* output_dev = NULL;
 
     // resize memory for values in GPU
-    resize_memory_op()(gpu_ctx, input1_dev, input.size());
-    resize_memory_op()(gpu_ctx, input2_dev, input.size());
-    resize_memory_op()(gpu_ctx, output_dev, input.size());
+    resize_memory_op()(input1_dev, input.size());
+    resize_memory_op()(input2_dev, input.size());
+    resize_memory_op()(output_dev, input.size());
 
     // syn the input data in CPU to GPU
-    synchronize_memory_op()(gpu_ctx, cpu_ctx, input1_dev, input1.data(), input.size());
-    synchronize_memory_op()(gpu_ctx, cpu_ctx, input2_dev, input2.data(), input.size());
+    synchronize_memory_op()(input1_dev, input1.data(), input.size());
+    synchronize_memory_op()(input2_dev, input2.data(), input.size());
 
     // run
     constantvector_addORsub_constantVector_op_gpu()(gpu_ctx,
@@ -509,7 +509,7 @@ TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_gp
                                                     constant2);
 
     // syn the output data in GPU to CPU
-    synchronize_memory_op_gpu()(cpu_ctx, gpu_ctx, output.data(), output_dev, output.size());
+    synchronize_memory_op_gpu()(output.data(), output_dev, output.size());
 
     for (int i = 0; i < input.size(); i++)
     {
@@ -517,9 +517,9 @@ TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_gp
         EXPECT_LT(fabs(output[i].real() - output_constantvector_addORsub_constantVector_op[i].real()), 1e-8);
     }
 
-    delete_memory_op()(gpu_ctx, input1_dev);
-    delete_memory_op()(gpu_ctx, input2_dev);
-    delete_memory_op()(gpu_ctx, output_dev);
+    delete_memory_op()(input1_dev);
+    delete_memory_op()(input2_dev);
+    delete_memory_op()(output_dev);
 }
 
 TEST_F(TestModuleHsolverMathKernel, axpy_op_gpu)
@@ -529,12 +529,12 @@ TEST_F(TestModuleHsolverMathKernel, axpy_op_gpu)
     std::complex<double>* Y_axpy_dev = NULL;
 
     // resize memory for values in GPU
-    resize_memory_op()(gpu_ctx, X_axpy_dev, X_axpy.size());
-    resize_memory_op()(gpu_ctx, Y_axpy_dev, Y_axpy.size());
+    resize_memory_op()(X_axpy_dev, X_axpy.size());
+    resize_memory_op()(Y_axpy_dev, Y_axpy.size());
 
     // syn the input data in CPU to GPU
-    synchronize_memory_op()(gpu_ctx, cpu_ctx, X_axpy_dev, X_axpy.data(), X_axpy.size());
-    synchronize_memory_op()(gpu_ctx, cpu_ctx, Y_axpy_dev, Y_axpy.data(), Y_axpy.size());
+    synchronize_memory_op()(X_axpy_dev, X_axpy.data(), X_axpy.size());
+    synchronize_memory_op()(Y_axpy_dev, Y_axpy.data(), Y_axpy.size());
 
     // run
     hsolver::createGpuBlasHandle();
@@ -542,7 +542,7 @@ TEST_F(TestModuleHsolverMathKernel, axpy_op_gpu)
     hsolver::destoryBLAShandle();
 
     // syn the output data in GPU to CPU
-    synchronize_memory_op_gpu()(cpu_ctx, gpu_ctx, Y_axpy.data(), Y_axpy_dev, Y_axpy.size());
+    synchronize_memory_op_gpu()(Y_axpy.data(), Y_axpy_dev, Y_axpy.size());
 
     for (int i = 0; i < input.size(); i++)
     {
@@ -550,8 +550,8 @@ TEST_F(TestModuleHsolverMathKernel, axpy_op_gpu)
         EXPECT_LT(fabs(Y_axpy[i].real() - output_axpy_op[i].real()), 1e-8);
     }
 
-    delete_memory_op()(gpu_ctx, X_axpy_dev);
-    delete_memory_op()(gpu_ctx, Y_axpy_dev);
+    delete_memory_op()(X_axpy_dev);
+    delete_memory_op()(Y_axpy_dev);
 }
 
 TEST_F(TestModuleHsolverMathKernel, scal_op_gpu)
@@ -560,10 +560,10 @@ TEST_F(TestModuleHsolverMathKernel, scal_op_gpu)
     std::complex<double>* X_scal_dev = NULL;
 
     // resize memory for values in GPU
-    resize_memory_op()(gpu_ctx, X_scal_dev, X_scal.size());
+    resize_memory_op()(X_scal_dev, X_scal.size());
 
     // syn the input data in CPU to GPU
-    synchronize_memory_op()(gpu_ctx, cpu_ctx, X_scal_dev, X_scal.data(), X_scal.size());
+    synchronize_memory_op()(X_scal_dev, X_scal.data(), X_scal.size());
 
     // run
     hsolver::createGpuBlasHandle();
@@ -571,14 +571,14 @@ TEST_F(TestModuleHsolverMathKernel, scal_op_gpu)
     hsolver::destoryBLAShandle();
 
     // syn the output data in GPU to CPU
-    synchronize_memory_op_gpu()(cpu_ctx, gpu_ctx, X_scal.data(), X_scal_dev, X_scal.size());
+    synchronize_memory_op_gpu()(X_scal.data(), X_scal_dev, X_scal.size());
 
     for (int i = 0; i < input.size(); i++)
     {
         EXPECT_LT(fabs(X_scal[i].imag() - output_scal_op[i].imag()), 1e-8);
         EXPECT_LT(fabs(X_scal[i].real() - output_scal_op[i].real()), 1e-8);
     }
-    delete_memory_op()(gpu_ctx, X_scal_dev);
+    delete_memory_op()(X_scal_dev);
 }
 
 TEST_F(TestModuleHsolverMathKernel, gemv_op_gpu)
@@ -589,21 +589,21 @@ TEST_F(TestModuleHsolverMathKernel, gemv_op_gpu)
     std::complex<double>* Y_gemv_dev = NULL;
 
     // resize memory for values in GPU
-    resize_memory_op()(gpu_ctx, A_gemv_dev, A_gemv.size());
-    resize_memory_op()(gpu_ctx, X_gemv_dev, X_gemv.size());
-    resize_memory_op()(gpu_ctx, Y_gemv_dev, Y_gemv.size());
+    resize_memory_op()(A_gemv_dev, A_gemv.size());
+    resize_memory_op()(X_gemv_dev, X_gemv.size());
+    resize_memory_op()(Y_gemv_dev, Y_gemv.size());
 
     // syn the input data in CPU to GPU
-    synchronize_memory_op()(gpu_ctx, cpu_ctx, A_gemv_dev, A_gemv.data(), A_gemv.size());
-    synchronize_memory_op()(gpu_ctx, cpu_ctx, X_gemv_dev, X_gemv.data(), X_gemv.size());
-    synchronize_memory_op()(gpu_ctx, cpu_ctx, Y_gemv_dev, Y_gemv.data(), Y_gemv.size());
+    synchronize_memory_op()(A_gemv_dev, A_gemv.data(), A_gemv.size());
+    synchronize_memory_op()(X_gemv_dev, X_gemv.data(), X_gemv.size());
+    synchronize_memory_op()(Y_gemv_dev, Y_gemv.data(), Y_gemv.size());
 
     // run
     hsolver::createGpuBlasHandle();
     gemv_op_gpu()(gpu_ctx, 'C', 2, 3, &ModuleBase::ONE, A_gemv_dev, 2, X_gemv_dev, 1, &ModuleBase::ONE, Y_gemv_dev, 1);
     hsolver::destoryBLAShandle();
     // syn the output data in GPU to CPU
-    synchronize_memory_op_gpu()(cpu_ctx, gpu_ctx, Y_gemv.data(), Y_gemv_dev, Y_gemv.size());
+    synchronize_memory_op_gpu()(Y_gemv.data(), Y_gemv_dev, Y_gemv.size());
 
     // cal right answer: Y_test_gemv
     char trans = 'C';
@@ -628,9 +628,9 @@ TEST_F(TestModuleHsolverMathKernel, gemv_op_gpu)
         EXPECT_LT(fabs(Y_gemv[i].real() - Y_test_gemv[i].real()), 1e-12);
     }
 
-    delete_memory_op()(gpu_ctx, A_gemv_dev);
-    delete_memory_op()(gpu_ctx, X_gemv_dev);
-    delete_memory_op()(gpu_ctx, Y_gemv_dev);
+    delete_memory_op()(A_gemv_dev);
+    delete_memory_op()(X_gemv_dev);
+    delete_memory_op()(Y_gemv_dev);
 }
 
 TEST_F(TestModuleHsolverMathKernel, matrixSetToAnother_op_gpu)
@@ -654,20 +654,16 @@ TEST_F(TestModuleHsolverMathKernel, matrixSetToAnother_op_gpu)
     int LDB = 4;
 
     std::complex<double>* device_A = nullptr;
-    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(gpu_ctx, device_A, A.size());
+    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(device_A, A.size());
     base_device::memory::
-        synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(gpu_ctx,
-                                                                                                        cpu_ctx,
-                                                                                                        device_A,
+        synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(device_A,
                                                                                                         A.data(),
                                                                                                         A.size());
 
     std::complex<double>* device_B = nullptr;
-    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(gpu_ctx, device_B, B.size());
+    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(device_B, B.size());
     base_device::memory::
-        synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(gpu_ctx,
-                                                                                                        cpu_ctx,
-                                                                                                        device_B,
+        synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(device_B,
                                                                                                         B.data(),
                                                                                                         B.size());
 
@@ -682,9 +678,7 @@ TEST_F(TestModuleHsolverMathKernel, matrixSetToAnother_op_gpu)
     std::vector<std::complex<double>> B_gpu2cpu(8);
     base_device::memory::synchronize_memory_op<std::complex<double>,
                                                base_device::DEVICE_CPU,
-                                               base_device::DEVICE_GPU>()(cpu_ctx,
-                                                                          gpu_ctx,
-                                                                          B_gpu2cpu.data(),
+                                               base_device::DEVICE_GPU>()(B_gpu2cpu.data(),
                                                                           device_B,
                                                                           B_gpu2cpu.size());
 
@@ -721,8 +715,8 @@ TEST_F(TestModuleHsolverMathKernel, matrixSetToAnother_op_gpu)
         EXPECT_LT(fabs(B_gpu2cpu[i].real() - B_cpu[i].real()), 1e-12);
     }
 
-    delete_memory_op()(gpu_ctx, device_A);
-    delete_memory_op()(gpu_ctx, device_B);
+    delete_memory_op()(device_A);
+    delete_memory_op()(device_B);
 }
 
 #endif // __UT_USE_CUDA || __UT_USE_ROCM
diff --git a/source/module_hsolver/kernels/test/perf_math_kernel.cpp b/source/module_hsolver/kernels/test/perf_math_kernel.cpp
index 173ef8b40b..b2b0704a9d 100644
--- a/source/module_hsolver/kernels/test/perf_math_kernel.cpp
+++ b/source/module_hsolver/kernels/test/perf_math_kernel.cpp
@@ -105,14 +105,14 @@ class PerfModuleHsolverMathKernel : public benchmark::Fixture {
         zconstant_a = std::complex<double>{(double)rand()+(double)rand()/(RAND_MAX+1.0),(double)rand()+(double)rand()/(RAND_MAX+1.0)};
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 
-        resize_memory_op()(gpu_ctx, test_zvector_a_gpu, dim_vector);
-        resize_memory_op()(gpu_ctx, test_zvector_b_gpu, dim_vector);
-        synchronize_memory_op()(gpu_ctx, cpu_ctx, test_zvector_a_gpu, test_zvector_a, dim_vector);
-        synchronize_memory_op()(gpu_ctx, cpu_ctx, test_zvector_b_gpu, test_zvector_b, dim_vector);
-
-        resize_memory_op()(gpu_ctx, result_zvector_gpu, dim_vector);
-        resize_memory_op_double()(gpu_ctx, test_dvector_a_gpu, dim_vector);
-        synchronize_memory_op_double()(gpu_ctx, cpu_ctx, test_dvector_a_gpu, test_dvector_a, dim_vector);
+        resize_memory_op()(test_zvector_a_gpu, dim_vector);
+        resize_memory_op()(test_zvector_b_gpu, dim_vector);
+        synchronize_memory_op()(test_zvector_a_gpu, test_zvector_a, dim_vector);
+        synchronize_memory_op()(test_zvector_b_gpu, test_zvector_b, dim_vector);
+
+        resize_memory_op()(result_zvector_gpu, dim_vector);
+        resize_memory_op_double()(test_dvector_a_gpu, dim_vector);
+        synchronize_memory_op_double()(test_dvector_a_gpu, test_dvector_a, dim_vector);
 
         hsolver::createGpuBlasHandle();
 
diff --git a/source/module_hsolver/test/hsolver_pw_sup.h b/source/module_hsolver/test/hsolver_pw_sup.h
index fcb2862a29..c61ffaca7d 100644
--- a/source/module_hsolver/test/hsolver_pw_sup.h
+++ b/source/module_hsolver/test/hsolver_pw_sup.h
@@ -139,15 +139,13 @@ DiagoDavid<T, Device>::DiagoDavid(const Real* precondition_in,
 
 template <typename T, typename Device>
 DiagoDavid<T, Device>::~DiagoDavid() {
-    delmem_complex_op()(this->ctx, this->hpsi);
-    delmem_complex_op()(this->ctx, this->spsi);
-    delmem_complex_op()(this->ctx, this->hcc);
-    delmem_complex_op()(this->ctx, this->scc);
-    delmem_complex_op()(this->ctx, this->vcc);
-    delmem_complex_op()(this->ctx, this->lagrange_matrix);
-    base_device::memory::delete_memory_op<Real, base_device::DEVICE_CPU>()(
-        this->cpu_ctx,
-        this->eigenvalue);
+    delmem_complex_op()(this->hpsi);
+    delmem_complex_op()(this->spsi);
+    delmem_complex_op()(this->hcc);
+    delmem_complex_op()(this->scc);
+    delmem_complex_op()(this->vcc);
+    delmem_complex_op()(this->lagrange_matrix);
+    base_device::memory::delete_memory_op<Real, base_device::DEVICE_CPU>()(this->eigenvalue);
 }
 
 template <typename T, typename Device>
diff --git a/source/module_psi/psi.cpp b/source/module_psi/psi.cpp
index 7942b412c9..a69635dffb 100644
--- a/source/module_psi/psi.cpp
+++ b/source/module_psi/psi.cpp
@@ -40,7 +40,7 @@ Psi<T, Device>::~Psi()
 {
     if (this->allocate_inside)
     {
-        delete_memory_op()(this->ctx, this->psi);
+        delete_memory_op()(this->psi);
     }
 }
 
@@ -58,7 +58,7 @@ Psi<T, Device>::Psi(const int nk_in, const int nbd_in, const int nbs_in, const i
 
     this->ngk = ngk_in; // modify later
     // This function will delete the psi array first(if psi exist), then malloc a new memory for it.
-    resize_memory_op()(this->ctx, this->psi, nk_in * static_cast<std::size_t>(nbd_in) * nbs_in, "no_record");
+    resize_memory_op()(this->psi, nk_in * static_cast<std::size_t>(nbd_in) * nbs_in, "no_record");
 
     this->nk = nk_in;
     this->nbands = nbd_in;
@@ -96,7 +96,7 @@ Psi<T, Device>::Psi(const int nk_in,
 
     this->ngk = ngk_in.data(); // modify later
     // This function will delete the psi array first(if psi exist), then malloc a new memory for it.
-    resize_memory_op()(this->ctx, this->psi, nk_in * static_cast<std::size_t>(nbd_in) * nbs_in, "no_record");
+    resize_memory_op()(this->psi, nk_in * static_cast<std::size_t>(nbd_in) * nbs_in, "no_record");
 
     this->nk = nk_in;
     this->nbands = nbd_in;
@@ -166,7 +166,7 @@ Psi<T, Device>::Psi(const int nk_in,
 
     this->ngk = nullptr;
     assert(nk_in > 0 && nbd_in >= 0 && nbs_in > 0);
-    resize_memory_op()(this->ctx, this->psi, nk_in * static_cast<std::size_t>(nbd_in) * nbs_in, "no_record");
+    resize_memory_op()(this->psi, nk_in * static_cast<std::size_t>(nbd_in) * nbs_in, "no_record");
 
     this->nk = nk_in;
     this->nbands = nbd_in;
@@ -201,9 +201,7 @@ Psi<T, Device>::Psi(const Psi& psi_in)
     // this function will copy psi_in.psi to this->psi no matter the device types of each other.
 
     this->resize(psi_in.get_nk(), psi_in.get_nbands(), psi_in.get_nbasis());
-    base_device::memory::synchronize_memory_op<T, Device, Device>()(this->ctx,
-                                                                    psi_in.get_device(),
-                                                                    this->psi,
+    base_device::memory::synchronize_memory_op<T, Device, Device>()(this->psi,
                                                                     psi_in.get_pointer() - psi_in.get_psi_bias(),
                                                                     psi_in.size());
     this->psi_bias = psi_in.get_psi_bias();
@@ -238,25 +236,19 @@ Psi<T, Device>::Psi(const Psi<T_in, Device_in>& psi_in)
     {
         auto* arr = (T*)malloc(sizeof(T) * psi_in.size());
         // cast the memory from T_in to T in CPU
-        base_device::memory::cast_memory_op<T, T_in, Device_in, Device_in>()(psi_in.get_device(),
-                                                                             psi_in.get_device(),
-                                                                             arr,
+        base_device::memory::cast_memory_op<T, T_in, Device_in, Device_in>()(arr,
                                                                              psi_in.get_pointer()
                                                                                  - psi_in.get_psi_bias(),
                                                                              psi_in.size());
         // synchronize the memory from CPU to GPU
-        base_device::memory::synchronize_memory_op<T, Device, Device_in>()(this->ctx,
-                                                                           psi_in.get_device(),
-                                                                           this->psi,
+        base_device::memory::synchronize_memory_op<T, Device, Device_in>()(this->psi,
                                                                            arr,
                                                                            psi_in.size());
         free(arr);
     }
     else
     {
-        base_device::memory::cast_memory_op<T, T_in, Device, Device_in>()(this->ctx,
-                                                                          psi_in.get_device(),
-                                                                          this->psi,
+        base_device::memory::cast_memory_op<T, T_in, Device, Device_in>()(this->psi,
                                                                           psi_in.get_pointer() - psi_in.get_psi_bias(),
                                                                           psi_in.size());
     }
@@ -269,7 +261,7 @@ template <typename T, typename Device>
 void Psi<T, Device>::set_all_psi(const T* another_pointer, const std::size_t size_in)
 {
     assert(size_in == this->size());
-    synchronize_memory_op()(this->ctx, this->ctx, this->psi, another_pointer, this->size());
+    synchronize_memory_op()(this->psi, another_pointer, this->size());
 }
 
 template <typename T, typename Device>
@@ -278,7 +270,7 @@ void Psi<T, Device>::resize(const int nks_in, const int nbands_in, const int nba
     assert(nks_in > 0 && nbands_in >= 0 && nbasis_in > 0);
 
     // This function will delete the psi array first(if psi exist), then malloc a new memory for it.
-    resize_memory_op()(this->ctx, this->psi, nks_in * static_cast<std::size_t>(nbands_in) * nbasis_in, "no_record");
+    resize_memory_op()(this->psi, nks_in * static_cast<std::size_t>(nbands_in) * nbasis_in, "no_record");
 
     // this->zero_out();
 
@@ -496,7 +488,7 @@ template <typename T, typename Device>
 void Psi<T, Device>::zero_out()
 {
     // this->psi.assign(this->psi.size(), T(0));
-    set_memory_op()(this->ctx, this->psi, 0, this->size());
+    set_memory_op()(this->psi, 0, this->size());
 }
 
 template <typename T, typename Device>
diff --git a/source/module_psi/psi_init.cpp b/source/module_psi/psi_init.cpp
index 2cdce4a5a8..102e2d4b1a 100644
--- a/source/module_psi/psi_init.cpp
+++ b/source/module_psi/psi_init.cpp
@@ -139,7 +139,7 @@ void PSIInit<T, Device>::initialize_psi(Psi<std::complex<double>>* psi,
         this->psi_initer->init_psig(psi_cpu->get_pointer(), ik);
         if (psi_device->get_pointer() != psi_cpu->get_pointer())
         {
-            syncmem_h2d_op()(ctx, cpu_ctx, psi_device->get_pointer(), psi_cpu->get_pointer(), nbands_start * nbasis);
+            syncmem_h2d_op()(psi_device->get_pointer(), psi_cpu->get_pointer(), nbands_start * nbasis);
         }
 
         std::vector<typename GetTypeReal<T>::type> etatom(nbands_start, 0.0);
@@ -170,7 +170,7 @@ void PSIInit<T, Device>::initialize_psi(Psi<std::complex<double>>* psi,
         {
             if (psi_device->get_pointer() != kspw_psi->get_pointer())
             {
-                syncmem_complex_op()(ctx, ctx, kspw_psi->get_pointer(), psi_device->get_pointer(), nbands * nbasis);
+                syncmem_complex_op()(kspw_psi->get_pointer(), psi_device->get_pointer(), nbands * nbasis);
             }
         }
     } // end k-point loop