diff --git a/python/pyabacus/src/hsolver/py_diago_cg.hpp b/python/pyabacus/src/hsolver/py_diago_cg.hpp index f1f84e9a77..f907e2e764 100644 --- a/python/pyabacus/src/hsolver/py_diago_cg.hpp +++ b/python/pyabacus/src/hsolver/py_diago_cg.hpp @@ -153,8 +153,6 @@ class PyDiagoCG const int nrow = ndim == 1 ? psi_in.NumElements() : psi_in.shape().dim_size(1); const int nbands = ndim == 1 ? 1 : psi_in.shape().dim_size(0); syncmem_z2z_h2h_op()( - this->ctx, - this->ctx, spsi_out.data>(), psi_in.data>(), static_cast(nrow * nbands) diff --git a/python/pyabacus/src/hsolver/py_diago_david.hpp b/python/pyabacus/src/hsolver/py_diago_david.hpp index 8a8d2c727e..7087af632e 100644 --- a/python/pyabacus/src/hsolver/py_diago_david.hpp +++ b/python/pyabacus/src/hsolver/py_diago_david.hpp @@ -135,7 +135,7 @@ class PyDiagoDavid const int nrow, const int nbands ) { - syncmem_op()(this->ctx, this->ctx, spsi_out, psi_in, static_cast(nbands * nrow)); + syncmem_op()(spsi_out, psi_in, static_cast(nbands * nrow)); }; obj = std::make_unique, base_device::DEVICE_CPU>>( diff --git a/source/module_base/kernels/dsp/dsp_connector.h b/source/module_base/kernels/dsp/dsp_connector.h index b51c67663e..ea0d17749e 100644 --- a/source/module_base/kernels/dsp/dsp_connector.h +++ b/source/module_base/kernels/dsp/dsp_connector.h @@ -75,7 +75,7 @@ void dsp_dav_subspace_reduce(T* hcc, T* scc, int nbase, int nbase_x, int notconv auto* swap = new T[notconv * nbase_x]; auto* target = new T[notconv * nbase_x]; - syncmem_complex_op()(cpu_ctx, cpu_ctx, swap, hcc + nbase * nbase_x, notconv * nbase_x); + syncmem_complex_op()(swap, hcc + nbase * nbase_x, notconv * nbase_x); if (base_device::get_current_precision(swap) == "single") { MPI_Reduce(swap, @@ -97,8 +97,8 @@ void dsp_dav_subspace_reduce(T* hcc, T* scc, int nbase, int nbase_x, int notconv diag_comm); } - syncmem_complex_op()(cpu_ctx, cpu_ctx, hcc + nbase * nbase_x, target, notconv * nbase_x); - syncmem_complex_op()(cpu_ctx, cpu_ctx, swap, scc + nbase * nbase_x, notconv * nbase_x); + syncmem_complex_op()(hcc + nbase * nbase_x, target, notconv * nbase_x); + syncmem_complex_op()(swap, scc + nbase * nbase_x, notconv * nbase_x); if (base_device::get_current_precision(swap) == "single") { @@ -121,7 +121,7 @@ void dsp_dav_subspace_reduce(T* hcc, T* scc, int nbase, int nbase_x, int notconv diag_comm); } - syncmem_complex_op()(cpu_ctx, cpu_ctx, scc + nbase * nbase_x, target, notconv * nbase_x); + syncmem_complex_op()(scc + nbase * nbase_x, target, notconv * nbase_x); delete[] swap; delete[] target; } diff --git a/source/module_base/kernels/test/math_op_test.cpp b/source/module_base/kernels/test/math_op_test.cpp index 7136ab8d35..cfdedb234e 100644 --- a/source/module_base/kernels/test/math_op_test.cpp +++ b/source/module_base/kernels/test/math_op_test.cpp @@ -306,13 +306,13 @@ TEST_F(TestModuleBaseMathMultiDevice, cal_ylm_real_op_gpu) std::vector ylm(expected_ylm.size(), 0.0); double * d_ylm = nullptr, * d_g = nullptr, * d_p = nullptr; - resmem_var_op()(gpu_ctx, d_g, g.size()); - resmem_var_op()(gpu_ctx, d_p, p.size()); - resmem_var_op()(gpu_ctx, d_ylm, ylm.size()); + resmem_var_op()(d_g, g.size()); + resmem_var_op()(d_p, p.size()); + resmem_var_op()(d_ylm, ylm.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_g, g.data(), g.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_p, p.data(), p.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_ylm, ylm.data(), ylm.size()); + syncmem_var_h2d_op()(d_g, g.data(), g.size()); + syncmem_var_h2d_op()(d_p, p.data(), p.size()); + syncmem_var_h2d_op()(d_ylm, ylm.data(), ylm.size()); ModuleBase::cal_ylm_real_op()(gpu_ctx, ng, @@ -326,15 +326,15 @@ TEST_F(TestModuleBaseMathMultiDevice, cal_ylm_real_op_gpu) d_p, d_ylm); - syncmem_var_d2h_op()(cpu_ctx, gpu_ctx, ylm.data(), d_ylm, ylm.size()); + syncmem_var_d2h_op()(ylm.data(), d_ylm, ylm.size()); for (int ii = 0; ii < ylm.size(); ii++) { EXPECT_LT(fabs(ylm[ii] - expected_ylm[ii]), 6e-5); } - delmem_var_op()(gpu_ctx, d_g); - delmem_var_op()(gpu_ctx, d_p); - delmem_var_op()(gpu_ctx, d_ylm); + delmem_var_op()(d_g); + delmem_var_op()(d_p); + delmem_var_op()(d_ylm); } #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM \ No newline at end of file diff --git a/source/module_base/math_chebyshev.cpp b/source/module_base/math_chebyshev.cpp index 9bfac7cac9..b2cc6aadea 100644 --- a/source/module_base/math_chebyshev.cpp +++ b/source/module_base/math_chebyshev.cpp @@ -63,8 +63,8 @@ Chebyshev::Chebyshev(const int norder_in) : fftw(2 * EXTEND * nord coefc_cpu = new std::complex[norder]; if (base_device::get_device_type(this->ctx) == base_device::GpuDevice) { - resmem_var_op()(this->ctx, this->coef_real, norder); - resmem_complex_op()(this->ctx, this->coef_complex, norder); + resmem_var_op()(this->coef_real, norder); + resmem_complex_op()(this->coef_complex, norder); } else { @@ -84,8 +84,8 @@ Chebyshev::~Chebyshev() delete[] polytrace; if (base_device::get_device_type(this->ctx) == base_device::GpuDevice) { - delmem_var_op()(this->ctx, this->coef_real); - delmem_complex_op()(this->ctx, this->coef_complex); + delmem_var_op()(this->coef_real); + delmem_complex_op()(this->coef_complex); } else { @@ -129,10 +129,10 @@ REAL Chebyshev::ddot_real(const std::complex* psi_L, pL = (REAL*)psi_L; pR = (REAL*)psi_R; REAL* dot_device = nullptr; - resmem_var_op()(this->ctx, dot_device, 1); + resmem_var_op()(dot_device, 1); container::kernels::blas_dot()(dim2, pL, 1, pR, 1, dot_device); - syncmem_var_d2h_op()(cpu_ctx, this->ctx, &result, dot_device, 1); - delmem_var_op()(this->ctx, dot_device); + syncmem_var_d2h_op()(&result, dot_device, 1); + delmem_var_op()(dot_device); } else { @@ -140,18 +140,18 @@ REAL Chebyshev::ddot_real(const std::complex* psi_L, pL = (REAL*)psi_L; pR = (REAL*)psi_R; REAL* dot_device = nullptr; - resmem_var_op()(this->ctx, dot_device, 1); + resmem_var_op()(dot_device, 1); for (int i = 0; i < m; ++i) { int dim2 = 2 * N; container::kernels::blas_dot()(dim2, pL, 1, pR, 1, dot_device); REAL result_temp = 0; - syncmem_var_d2h_op()(cpu_ctx, this->ctx, &result_temp, dot_device, 1); + syncmem_var_d2h_op()(&result_temp, dot_device, 1); result += result_temp; pL += 2 * LDA; pR += 2 * LDA; } - delmem_var_op()(this->ctx, dot_device); + delmem_var_op()(dot_device); } return result; } @@ -211,7 +211,7 @@ void Chebyshev::calcoef_real(std::function fun) if (base_device::get_device_type(this->ctx) == base_device::GpuDevice) { - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, coef_real, coefr_cpu, norder); + syncmem_var_h2d_op()(coef_real, coefr_cpu, norder); } getcoef_real = true; @@ -301,7 +301,7 @@ void Chebyshev::calcoef_complex(std::function(s } if (base_device::get_device_type(this->ctx) == base_device::GpuDevice) { - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, coef_complex, coefc_cpu, norder); + syncmem_complex_h2d_op()(coef_complex, coefc_cpu, norder); } getcoef_complex = true; @@ -392,7 +392,7 @@ void Chebyshev::calcoef_pair(std::function fun1, std:: if (base_device::get_device_type(this->ctx) == base_device::GpuDevice) { - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, coef_complex, coefc_cpu, norder); + syncmem_complex_h2d_op()(coef_complex, coefc_cpu, norder); } getcoef_complex = true; @@ -427,17 +427,17 @@ void Chebyshev::calfinalvec_real( ndmxt = LDA * m; } - resmem_complex_op()(this->ctx, arraynp1, ndmxt); - resmem_complex_op()(this->ctx, arrayn, ndmxt); - resmem_complex_op()(this->ctx, arrayn_1, ndmxt); + resmem_complex_op()(arraynp1, ndmxt); + resmem_complex_op()(arrayn, ndmxt); + resmem_complex_op()(arrayn_1, ndmxt); - memcpy_complex_op()(this->ctx, this->ctx, arrayn_1, wavein, ndmxt); + memcpy_complex_op()(arrayn_1, wavein, ndmxt); // ModuleBase::GlobalFunc::DCOPY(wavein, arrayn_1, ndmxt); funA(arrayn_1, arrayn, m); // 0- & 1-st order - setmem_complex_op()(this->ctx, waveout, 0, ndmxt); + setmem_complex_op()(waveout, 0, ndmxt); std::complex coef0 = std::complex(coefr_cpu[0], 0); container::kernels::blas_axpy, ct_Device>()(ndmxt, &coef0, arrayn_1, 1, waveout, 1); std::complex coef1 = std::complex(coefr_cpu[1], 0); @@ -462,9 +462,9 @@ void Chebyshev::calfinalvec_real( arrayn = arraynp1; arraynp1 = tem; } - delmem_complex_op()(this->ctx, arraynp1); - delmem_complex_op()(this->ctx, arrayn); - delmem_complex_op()(this->ctx, arrayn_1); + delmem_complex_op()(arraynp1); + delmem_complex_op()(arrayn); + delmem_complex_op()(arrayn_1); return; } @@ -496,16 +496,16 @@ void Chebyshev::calfinalvec_complex( ndmxt = LDA * m; } - resmem_complex_op()(this->ctx, arraynp1, ndmxt); - resmem_complex_op()(this->ctx, arrayn, ndmxt); - resmem_complex_op()(this->ctx, arrayn_1, ndmxt); + resmem_complex_op()(arraynp1, ndmxt); + resmem_complex_op()(arrayn, ndmxt); + resmem_complex_op()(arrayn_1, ndmxt); - memcpy_complex_op()(this->ctx, this->ctx, arrayn_1, wavein, ndmxt); + memcpy_complex_op()(arrayn_1, wavein, ndmxt); funA(arrayn_1, arrayn, m); // 0- & 1-st order - setmem_complex_op()(this->ctx, waveout, 0, ndmxt); + setmem_complex_op()(waveout, 0, ndmxt); container::kernels::blas_axpy, ct_Device>()(ndmxt, &coefc_cpu[0], arrayn_1, 1, waveout, 1); container::kernels::blas_axpy, ct_Device>()(ndmxt, &coefc_cpu[1], arrayn, 1, waveout, 1); // for (int i = 0; i < ndmxt; ++i) @@ -527,9 +527,9 @@ void Chebyshev::calfinalvec_complex( arrayn = arraynp1; arraynp1 = tem; } - delmem_complex_op()(this->ctx, arraynp1); - delmem_complex_op()(this->ctx, arrayn); - delmem_complex_op()(this->ctx, arrayn_1); + delmem_complex_op()(arraynp1); + delmem_complex_op()(arrayn); + delmem_complex_op()(arrayn_1); return; } @@ -553,7 +553,7 @@ void Chebyshev::calpolyvec_complex( std::complex*tmpin = wavein, *tmpout = arrayn_1; for (int i = 0; i < m; ++i) { - memcpy_complex_op()(this->ctx, this->ctx, tmpout, tmpin, N); + memcpy_complex_op()(tmpout, tmpin, N); // ModuleBase::GlobalFunc::DCOPY(tmpin, tmpout, N); tmpin += LDA; tmpout += LDA; @@ -595,11 +595,11 @@ void Chebyshev::tracepolyA( ndmxt = LDA * m; } - resmem_complex_op()(this->ctx, arraynp1, ndmxt); - resmem_complex_op()(this->ctx, arrayn, ndmxt); - resmem_complex_op()(this->ctx, arrayn_1, ndmxt); + resmem_complex_op()(arraynp1, ndmxt); + resmem_complex_op()(arrayn, ndmxt); + resmem_complex_op()(arrayn_1, ndmxt); - memcpy_complex_op()(this->ctx, this->ctx, arrayn_1, wavein, ndmxt); + memcpy_complex_op()(arrayn_1, wavein, ndmxt); // ModuleBase::GlobalFunc::DCOPY(wavein, arrayn_1, ndmxt); funA(arrayn_1, arrayn, m); @@ -618,9 +618,9 @@ void Chebyshev::tracepolyA( arraynp1 = tem; } - delmem_complex_op()(this->ctx, arraynp1); - delmem_complex_op()(this->ctx, arrayn); - delmem_complex_op()(this->ctx, arrayn_1); + delmem_complex_op()(arraynp1); + delmem_complex_op()(arrayn); + delmem_complex_op()(arrayn_1); return; } @@ -669,11 +669,11 @@ bool Chebyshev::checkconverge( std::complex* arrayn = nullptr; std::complex* arrayn_1 = nullptr; - resmem_complex_op()(this->ctx, arraynp1, LDA); - resmem_complex_op()(this->ctx, arrayn, LDA); - resmem_complex_op()(this->ctx, arrayn_1, LDA); + resmem_complex_op()(arraynp1, LDA); + resmem_complex_op()(arrayn, LDA); + resmem_complex_op()(arrayn_1, LDA); - memcpy_complex_op()(this->ctx, this->ctx, arrayn_1, wavein, N); + memcpy_complex_op()(arrayn_1, wavein, N); // ModuleBase::GlobalFunc::DCOPY(wavein, arrayn_1, N); if (tmin == tmax) @@ -754,9 +754,9 @@ bool Chebyshev::checkconverge( arraynp1 = tem; } - delmem_complex_op()(this->ctx, arraynp1); - delmem_complex_op()(this->ctx, arrayn); - delmem_complex_op()(this->ctx, arrayn_1); + delmem_complex_op()(arraynp1); + delmem_complex_op()(arrayn); + delmem_complex_op()(arrayn_1); return converge; } diff --git a/source/module_base/math_ylmreal.cpp b/source/module_base/math_ylmreal.cpp index 953112996a..fac76cf959 100644 --- a/source/module_base/math_ylmreal.cpp +++ b/source/module_base/math_ylmreal.cpp @@ -327,7 +327,7 @@ void YlmReal::Ylm_Real(Device * ctx, const int lmax2, const int ng, const FPTYPE ModuleBase::WARNING_QUIT("YLM_REAL","l>30 or l<0"); } FPTYPE * p = nullptr, * phi = nullptr, * cost = nullptr; - resmem_var_op()(ctx, p, (lmax + 1) * (lmax + 1) * ng, "YlmReal::Ylm_Real"); + resmem_var_op()(p, (lmax + 1) * (lmax + 1) * ng, "YlmReal::Ylm_Real"); cal_ylm_real_op()( ctx, @@ -342,9 +342,9 @@ void YlmReal::Ylm_Real(Device * ctx, const int lmax2, const int ng, const FPTYPE p, ylm); - delmem_var_op()(ctx, p); - delmem_var_op()(ctx, phi); - delmem_var_op()(ctx, cost); + delmem_var_op()(p); + delmem_var_op()(phi); + delmem_var_op()(cost); } // end subroutine ylmr2 //========================================================== diff --git a/source/module_base/module_device/cuda/memory_op.cu b/source/module_base/module_device/cuda/memory_op.cu index bc9384c446..c4f9efdb42 100644 --- a/source/module_base/module_device/cuda/memory_op.cu +++ b/source/module_base/module_device/cuda/memory_op.cu @@ -52,14 +52,13 @@ __global__ void cast_memory(std::complex* out, const FPTYPE_in* in, } template -void resize_memory_op::operator()(const base_device::DEVICE_GPU* dev, - FPTYPE*& arr, +void resize_memory_op::operator()(FPTYPE*& arr, const size_t size, const char* record_in) { if (arr != nullptr) { - delete_memory_op()(dev, arr); + delete_memory_op()(arr); } cudaErrcheck(cudaMalloc((void**)&arr, sizeof(FPTYPE) * size)); std::string record_string; @@ -79,8 +78,7 @@ void resize_memory_op::operator()(const base_de } template -void set_memory_op::operator()(const base_device::DEVICE_GPU* dev, - FPTYPE* arr, +void set_memory_op::operator()(FPTYPE* arr, const int var, const size_t size) { @@ -89,8 +87,6 @@ void set_memory_op::operator()(const base_devic template void synchronize_memory_op::operator()( - const base_device::DEVICE_CPU* dev_out, - const base_device::DEVICE_GPU* dev_in, FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size) @@ -100,8 +96,6 @@ void synchronize_memory_op void synchronize_memory_op::operator()( - const base_device::DEVICE_GPU* dev_out, - const base_device::DEVICE_CPU* dev_in, FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size) @@ -111,8 +105,6 @@ void synchronize_memory_op void synchronize_memory_op::operator()( - const base_device::DEVICE_GPU* dev_out, - const base_device::DEVICE_GPU* dev_in, FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size) @@ -123,9 +115,7 @@ void synchronize_memory_op struct cast_memory_op { - void operator()(const base_device::DEVICE_GPU* dev_out, - const base_device::DEVICE_GPU* dev_in, - FPTYPE_out* arr_out, + void operator()(FPTYPE_out* arr_out, const FPTYPE_in* arr_in, const size_t size) { @@ -142,9 +132,7 @@ struct cast_memory_op struct cast_memory_op { - void operator()(const base_device::DEVICE_GPU* dev_out, - const base_device::DEVICE_CPU* dev_in, - FPTYPE_out* arr_out, + void operator()(FPTYPE_out* arr_out, const FPTYPE_in* arr_in, const size_t size) { @@ -152,9 +140,7 @@ struct cast_memory_op::value) { - synchronize_memory_op()(dev_out, - dev_in, - arr_out, + synchronize_memory_op()(arr_out, reinterpret_cast(arr_in), size); return; @@ -171,18 +157,14 @@ struct cast_memory_op struct cast_memory_op { - void operator()(const base_device::DEVICE_CPU* dev_out, - const base_device::DEVICE_GPU* dev_in, - FPTYPE_out* arr_out, + void operator()(FPTYPE_out* arr_out, const FPTYPE_in* arr_in, const size_t size) { if (size == 0) {return;} // No need to cast the memory if the data types are the same. if (std::is_same::value) { - synchronize_memory_op()(dev_out, - dev_in, - arr_out, + synchronize_memory_op()(arr_out, reinterpret_cast(arr_in), size); return; @@ -197,7 +179,7 @@ struct cast_memory_op -void delete_memory_op::operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr) +void delete_memory_op::operator()(FPTYPE* arr) { cudaErrcheck(cudaFree(arr)); } diff --git a/source/module_base/module_device/memory_op.cpp b/source/module_base/module_device/memory_op.cpp index 3c807dfad7..525ecee89f 100644 --- a/source/module_base/module_device/memory_op.cpp +++ b/source/module_base/module_device/memory_op.cpp @@ -18,7 +18,7 @@ namespace memory template struct resize_memory_op { - void operator()(const base_device::DEVICE_CPU* dev, FPTYPE*& arr, const size_t size, const char* record_in) + void operator()(FPTYPE*& arr, const size_t size, const char* record_in) { if (arr != nullptr) { @@ -45,7 +45,7 @@ struct resize_memory_op template struct set_memory_op { - void operator()(const base_device::DEVICE_CPU* dev, FPTYPE* arr, const int var, const size_t size) + void operator()(FPTYPE* arr, const int var, const size_t size) { ModuleBase::OMP_PARALLEL([&](int num_thread, int thread_id) { int beg = 0, len = 0; @@ -58,9 +58,7 @@ struct set_memory_op template struct synchronize_memory_op { - void operator()(const base_device::DEVICE_CPU* dev_out, - const base_device::DEVICE_CPU* dev_in, - FPTYPE* arr_out, + void operator()(FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size) { @@ -75,9 +73,7 @@ struct synchronize_memory_op struct cast_memory_op { - void operator()(const base_device::DEVICE_CPU* dev_out, - const base_device::DEVICE_CPU* dev_in, - FPTYPE_out* arr_out, + void operator()(FPTYPE_out* arr_out, const FPTYPE_in* arr_in, const size_t size) { @@ -94,7 +90,7 @@ struct cast_memory_op struct delete_memory_op { - void operator()(const base_device::DEVICE_CPU* dev, FPTYPE* arr) + void operator()(FPTYPE* arr) { free(arr); } @@ -156,8 +152,7 @@ template struct delete_memory_op*, base_device::DEVICE_CPU> template struct resize_memory_op { - void operator()(const base_device::DEVICE_GPU* dev, - FPTYPE*& arr, + void operator()(FPTYPE*& arr, const size_t size, const char* record_in = nullptr) { @@ -167,7 +162,7 @@ struct resize_memory_op template struct set_memory_op { - void operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr, const int var, const size_t size) + void operator()(FPTYPE* arr, const int var, const size_t size) { } }; @@ -175,9 +170,7 @@ struct set_memory_op template struct synchronize_memory_op { - void operator()(const base_device::DEVICE_GPU* dev_out, - const base_device::DEVICE_GPU* dev_in, - FPTYPE* arr_out, + void operator()(FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size) { @@ -187,9 +180,7 @@ struct synchronize_memory_op struct synchronize_memory_op { - void operator()(const base_device::DEVICE_GPU* dev_out, - const base_device::DEVICE_CPU* dev_in, - FPTYPE* arr_out, + void operator()(FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size) { @@ -199,9 +190,7 @@ struct synchronize_memory_op struct synchronize_memory_op { - void operator()(const base_device::DEVICE_CPU* dev_out, - const base_device::DEVICE_GPU* dev_in, - FPTYPE* arr_out, + void operator()(FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size) { @@ -211,9 +200,7 @@ struct synchronize_memory_op struct cast_memory_op { - void operator()(const base_device::DEVICE_GPU* dev_out, - const base_device::DEVICE_GPU* dev_in, - FPTYPE_out* arr_out, + void operator()(FPTYPE_out* arr_out, const FPTYPE_in* arr_in, const size_t size) { @@ -223,9 +210,7 @@ struct cast_memory_op struct cast_memory_op { - void operator()(const base_device::DEVICE_GPU* dev_out, - const base_device::DEVICE_CPU* dev_in, - FPTYPE_out* arr_out, + void operator()(FPTYPE_out* arr_out, const FPTYPE_in* arr_in, const size_t size) { @@ -235,9 +220,7 @@ struct cast_memory_op struct cast_memory_op { - void operator()(const base_device::DEVICE_CPU* dev_out, - const base_device::DEVICE_GPU* dev_in, - FPTYPE_out* arr_out, + void operator()(FPTYPE_out* arr_out, const FPTYPE_in* arr_in, const size_t size) { @@ -247,7 +230,7 @@ struct cast_memory_op struct delete_memory_op { - void operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr) + void operator()(FPTYPE* arr) { } }; @@ -353,7 +336,7 @@ template struct delete_memory_op, base_device::DEVICE_GPU>; template struct resize_memory_op_mt { - void operator()(const base_device::DEVICE_CPU* dev, FPTYPE*& arr, const size_t size, const char* record_in) + void operator()(FPTYPE*& arr, const size_t size, const char* record_in) { if (arr != nullptr) { @@ -380,7 +363,7 @@ struct resize_memory_op_mt template struct delete_memory_op_mt { - void operator()(const base_device::DEVICE_CPU* dev, FPTYPE* arr) + void operator()(FPTYPE* arr) { free_ht(arr); } @@ -401,39 +384,39 @@ template struct delete_memory_op_mt, base_device::DEVICE_CP #endif template -void resize_memory(FPTYPE* arr, base_device::AbacusDevice_t device_type) +void resize_memory(FPTYPE* arr, const size_t size, base_device::AbacusDevice_t device_type) { if (device_type == base_device::AbacusDevice_t::CpuDevice){ - resize_memory_op()(cpu_ctx, arr); + resize_memory_op()(arr, size); } else if (device_type == base_device::AbacusDevice_t::GpuDevice){ - resize_memory_op()(gpu_ctx, arr); + resize_memory_op()(arr, size); } } template void set_memory(FPTYPE* arr, const int var, const size_t size, base_device::AbacusDevice_t device_type){ if (device_type == base_device::AbacusDevice_t::CpuDevice){ - set_memory_op()(cpu_ctx, arr, var, size); + set_memory_op()(arr, var, size); } else if (device_type == base_device::AbacusDevice_t::GpuDevice){ - set_memory_op()(gpu_ctx, arr, var, size); + set_memory_op()(arr, var, size); } } template void synchronize_memory(FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size, base_device::AbacusDevice_t device_type_out, base_device::AbacusDevice_t device_type_in){ if (device_type_out == base_device::AbacusDevice_t::CpuDevice || device_type_in == base_device::AbacusDevice_t::CpuDevice){ - synchronize_memory_op()(cpu_ctx, cpu_ctx, arr_out, arr_in, size); + synchronize_memory_op()(arr_out, arr_in, size); } else if (device_type_out == base_device::AbacusDevice_t::CpuDevice || device_type_in == base_device::AbacusDevice_t::GpuDevice){ - synchronize_memory_op()(cpu_ctx, gpu_ctx, arr_out, arr_in, size); + synchronize_memory_op()(arr_out, arr_in, size); } else if (device_type_out == base_device::AbacusDevice_t::GpuDevice || device_type_in == base_device::AbacusDevice_t::CpuDevice){ - synchronize_memory_op()(gpu_ctx, cpu_ctx, arr_out, arr_in, size); + synchronize_memory_op()(arr_out, arr_in, size); } else if (device_type_out == base_device::AbacusDevice_t::GpuDevice || device_type_in == base_device::AbacusDevice_t::GpuDevice){ - synchronize_memory_op()(gpu_ctx, gpu_ctx, arr_out, arr_in, size); + synchronize_memory_op()(arr_out, arr_in, size); } } @@ -441,16 +424,16 @@ template void cast_memory(FPTYPE_out* arr_out, const FPTYPE_in* arr_in, const size_t size, base_device::AbacusDevice_t device_type_out, base_device::AbacusDevice_t device_type_in) { if (device_type_out == base_device::AbacusDevice_t::CpuDevice || device_type_in == base_device::AbacusDevice_t::CpuDevice){ - cast_memory_op()(cpu_ctx, cpu_ctx, arr_out, arr_in, size); + cast_memory_op()(arr_out, arr_in, size); } else if (device_type_out == base_device::AbacusDevice_t::CpuDevice || device_type_in == base_device::AbacusDevice_t::GpuDevice){ - cast_memory_op()(cpu_ctx, gpu_ctx, arr_out, arr_in, size); + cast_memory_op()(arr_out, arr_in, size); } else if (device_type_out == base_device::AbacusDevice_t::GpuDevice || device_type_in == base_device::AbacusDevice_t::CpuDevice){ - cast_memory_op()(gpu_ctx, cpu_ctx, arr_out, arr_in, size); + cast_memory_op()(arr_out, arr_in, size); } else if (device_type_out == base_device::AbacusDevice_t::GpuDevice || device_type_in == base_device::AbacusDevice_t::GpuDevice){ - cast_memory_op()(gpu_ctx, gpu_ctx, arr_out, arr_in, size); + cast_memory_op()(arr_out, arr_in, size); } } @@ -458,10 +441,10 @@ template void delete_memory(FPTYPE* arr, base_device::AbacusDevice_t device_type) { if (device_type == base_device::AbacusDevice_t::CpuDevice){ - delete_memory_op()(cpu_ctx, arr); + delete_memory_op()(arr); } else if (device_type == base_device::AbacusDevice_t::GpuDevice){ - delete_memory_op()(gpu_ctx, arr); + delete_memory_op()(arr); } } diff --git a/source/module_base/module_device/memory_op.h b/source/module_base/module_device/memory_op.h index 14926caf9b..e09294d970 100644 --- a/source/module_base/module_device/memory_op.h +++ b/source/module_base/module_device/memory_op.h @@ -18,13 +18,12 @@ struct resize_memory_op /// @brief Allocate memory for a given pointer. Note this op will free the pointer first. /// /// Input Parameters - /// \param dev : the type of computing device /// \param size : array size /// \param record_string : label for memory record /// /// Output Parameters /// \param arr : allocated array - void operator()(const Device* dev, FPTYPE*& arr, const size_t size, const char* record_in = nullptr); + void operator()(FPTYPE*& arr, const size_t size, const char* record_in = nullptr); }; template @@ -33,13 +32,12 @@ struct set_memory_op /// @brief memset for multi-device /// /// Input Parameters - /// \param dev : the type of computing device /// \param var : the specified constant value /// \param size : array size /// /// Output Parameters /// \param arr : output array initialized by the input value - void operator()(const Device* dev, FPTYPE* arr, const int var, const size_t size); + void operator()(FPTYPE* arr, const int var, const size_t size); }; template @@ -48,16 +46,12 @@ struct synchronize_memory_op /// @brief memcpy for multi-device /// /// Input Parameters - /// \param dev_out : the type of computing device of arr_out - /// \param dev_in : the type of computing device of arr_in /// \param arr_in : input array /// \param size : array size /// /// Output Parameters /// \param arr_out : output array initialized by the input array - void operator()(const Device_out* dev_out, - const Device_in* dev_in, - FPTYPE* arr_out, + void operator()(FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size); }; @@ -68,16 +62,12 @@ struct cast_memory_op /// @brief memcpy for multi-device /// /// Input Parameters - /// \param dev_out : the type of computing device of arr_out - /// \param dev_in : the type of computing device of arr_in /// \param arr_in : input array /// \param size : array size /// /// Output Parameters /// \param arr_out : output array initialized by the input array - void operator()(const Device_out* dev_out, - const Device_in* dev_in, - FPTYPE_out* arr_out, + void operator()(FPTYPE_out* arr_out, const FPTYPE_in* arr_in, const size_t size); }; @@ -88,13 +78,12 @@ struct delete_memory_op /// @brief free memory for multi-device /// /// Input Parameters - /// \param dev : the type of computing device /// \param arr : the input array - void operator()(const Device* dev, FPTYPE* arr); + void operator()(FPTYPE* arr); }; template -void resize_memory(FPTYPE* arr, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); +void resize_memory(FPTYPE* arr, const size_t size, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); template void set_memory(FPTYPE* arr, const int var, const size_t size, base_device::AbacusDevice_t device_type = base_device::AbacusDevice_t::CpuDevice); @@ -113,8 +102,7 @@ void delete_memory(FPTYPE* arr, base_device::AbacusDevice_t device_type = base_d template struct resize_memory_op { - void operator()(const base_device::DEVICE_GPU* dev, - FPTYPE*& arr, + void operator()(FPTYPE*& arr, const size_t size, const char* record_in = nullptr); }; @@ -122,33 +110,27 @@ struct resize_memory_op template struct set_memory_op { - void operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr, const int var, const size_t size); + void operator()(FPTYPE* arr, const int var, const size_t size); }; template struct synchronize_memory_op { - void operator()(const base_device::DEVICE_CPU* dev_out, - const base_device::DEVICE_GPU* dev_in, - FPTYPE* arr_out, + void operator()(FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size); }; template struct synchronize_memory_op { - void operator()(const base_device::DEVICE_GPU* dev_out, - const base_device::DEVICE_CPU* dev_in, - FPTYPE* arr_out, + void operator()(FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size); }; template struct synchronize_memory_op { - void operator()(const base_device::DEVICE_GPU* dev_out, - const base_device::DEVICE_GPU* dev_in, - FPTYPE* arr_out, + void operator()(FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size); }; @@ -156,7 +138,7 @@ struct synchronize_memory_op struct delete_memory_op { - void operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr); + void operator()(FPTYPE* arr); }; #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM @@ -168,13 +150,12 @@ struct resize_memory_op_mt /// @brief Allocate memory for a given pointer. Note this op will free the pointer first. /// /// Input Parameters - /// \param dev : the type of computing device /// \param size : array size /// \param record_string : label for memory record /// /// Output Parameters /// \param arr : allocated array - void operator()(const Device* dev, FPTYPE*& arr, const size_t size, const char* record_in = nullptr); + void operator()(FPTYPE*& arr, const size_t size, const char* record_in = nullptr); }; template @@ -183,9 +164,8 @@ struct delete_memory_op_mt /// @brief free memory for multi-device /// /// Input Parameters - /// \param dev : the type of computing device /// \param arr : the input array - void operator()(const Device* dev, FPTYPE* arr); + void operator()(FPTYPE* arr); }; #endif // __DSP diff --git a/source/module_base/module_device/rocm/memory_op.hip.cu b/source/module_base/module_device/rocm/memory_op.hip.cu index 1909cfb771..7e4cf7f497 100644 --- a/source/module_base/module_device/rocm/memory_op.hip.cu +++ b/source/module_base/module_device/rocm/memory_op.hip.cu @@ -39,21 +39,19 @@ __global__ void cast_memory(std::complex* out, const std::complex -void resize_memory_op::operator()(const base_device::DEVICE_GPU* dev, - FPTYPE*& arr, +void resize_memory_op::operator()(FPTYPE*& arr, const size_t size, const char* record_in) { if (arr != nullptr) { - delete_memory_op()(dev, arr); + delete_memory_op()(arr); } hipErrcheck(hipMalloc((void**)&arr, sizeof(FPTYPE) * size)); } template -void set_memory_op::operator()(const base_device::DEVICE_GPU* dev, - FPTYPE* arr, +void set_memory_op::operator()(FPTYPE* arr, const int var, const size_t size) { @@ -62,8 +60,6 @@ void set_memory_op::operator()(const base_devic template void synchronize_memory_op::operator()( - const base_device::DEVICE_CPU* dev_out, - const base_device::DEVICE_GPU* dev_in, FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size) @@ -73,8 +69,6 @@ void synchronize_memory_op void synchronize_memory_op::operator()( - const base_device::DEVICE_GPU* dev_out, - const base_device::DEVICE_CPU* dev_in, FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size) @@ -84,8 +78,6 @@ void synchronize_memory_op void synchronize_memory_op::operator()( - const base_device::DEVICE_GPU* dev_out, - const base_device::DEVICE_GPU* dev_in, FPTYPE* arr_out, const FPTYPE* arr_in, const size_t size) @@ -95,9 +87,7 @@ void synchronize_memory_op struct cast_memory_op { - void operator()(const base_device::DEVICE_GPU* dev_out, - const base_device::DEVICE_GPU* dev_in, - FPTYPE_out* arr_out, + void operator()(FPTYPE_out* arr_out, const FPTYPE_in* arr_in, const size_t size) { @@ -110,9 +100,7 @@ struct cast_memory_op struct cast_memory_op { - void operator()(const base_device::DEVICE_GPU* dev_out, - const base_device::DEVICE_CPU* dev_in, - FPTYPE_out* arr_out, + void operator()(FPTYPE_out* arr_out, const FPTYPE_in* arr_in, const size_t size) { @@ -139,9 +127,7 @@ struct cast_memory_op struct cast_memory_op { - void operator()(const base_device::DEVICE_CPU* dev_out, - const base_device::DEVICE_GPU* dev_in, - FPTYPE_out* arr_out, + void operator()(FPTYPE_out* arr_out, const FPTYPE_in* arr_in, const size_t size) { @@ -166,7 +152,7 @@ struct cast_memory_op -void delete_memory_op::operator()(const base_device::DEVICE_GPU* dev, FPTYPE* arr) +void delete_memory_op::operator()(FPTYPE* arr) { hipErrcheck(hipFree(arr)); } diff --git a/source/module_base/module_device/test/memory_test.cpp b/source/module_base/module_device/test/memory_test.cpp index 6dc45e5091..39c5c25d52 100644 --- a/source/module_base/module_device/test/memory_test.cpp +++ b/source/module_base/module_device/test/memory_test.cpp @@ -91,7 +91,7 @@ class TestModulePsiMemory : public ::testing::Test TEST_F(TestModulePsiMemory, set_memory_op_double_cpu) { std::vector v_xx = xx; - set_memory_double_cpu_op()(cpu_ctx, v_xx.data(), 0, xx.size()); + set_memory_double_cpu_op()(v_xx.data(), 0, xx.size()); for (int ii = 0; ii < xx.size(); ii++) { EXPECT_EQ(v_xx[ii], 0.0); @@ -101,7 +101,7 @@ TEST_F(TestModulePsiMemory, set_memory_op_double_cpu) TEST_F(TestModulePsiMemory, set_memory_op_complex_double_cpu) { std::vector> vz_xx = z_xx; - set_memory_complex_double_cpu_op()(cpu_ctx, vz_xx.data(), 0, z_xx.size()); + set_memory_complex_double_cpu_op()(vz_xx.data(), 0, z_xx.size()); for (int ii = 0; ii < z_xx.size(); ii++) { EXPECT_EQ(vz_xx[ii], std::complex(0.0, 0.0)); @@ -111,7 +111,7 @@ TEST_F(TestModulePsiMemory, set_memory_op_complex_double_cpu) TEST_F(TestModulePsiMemory, resize_memory_op_double_cpu) { double* xx_tmp = NULL; - resize_memory_double_cpu_op()(cpu_ctx, xx_tmp, xx.size()); + resize_memory_double_cpu_op()(xx_tmp, xx.size()); for (int ii = 0; ii < xx.size(); ii++) { xx_tmp[ii] = xx[ii]; @@ -126,7 +126,7 @@ TEST_F(TestModulePsiMemory, resize_memory_op_double_cpu) TEST_F(TestModulePsiMemory, resize_memory_op_comlex_double_cpu) { std::complex* z_xx_tmp = NULL; - resize_memory_comlex_double_cpu_op()(cpu_ctx, z_xx_tmp, z_xx.size()); + resize_memory_comlex_double_cpu_op()(z_xx_tmp, z_xx.size()); for (int ii = 0; ii < z_xx.size(); ii++) { z_xx_tmp[ii] = z_xx[ii]; @@ -141,7 +141,7 @@ TEST_F(TestModulePsiMemory, resize_memory_op_comlex_double_cpu) TEST_F(TestModulePsiMemory, synchronize_memory_op_double_cpu_to_cpu) { std::vector h_xx(xx.size(), 0); - synchronize_memory_double_cpu_to_cpu_op()(cpu_ctx, cpu_ctx, h_xx.data(), xx.data(), xx.size()); + synchronize_memory_double_cpu_to_cpu_op()(h_xx.data(), xx.data(), xx.size()); for (int ii = 0; ii < z_xx.size(); ii++) { EXPECT_EQ(h_xx[ii], xx[ii]); @@ -151,7 +151,7 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_double_cpu_to_cpu) TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_cpu_to_cpu) { std::vector> hz_xx(z_xx.size(), std::complex(0, 0)); - synchronize_memory_complex_double_cpu_to_cpu_op()(cpu_ctx, cpu_ctx, hz_xx.data(), z_xx.data(), z_xx.size()); + synchronize_memory_complex_double_cpu_to_cpu_op()(hz_xx.data(), z_xx.data(), z_xx.size()); for (int ii = 0; ii < z_xx.size(); ii++) { EXPECT_EQ(hz_xx[ii], z_xx[ii]); @@ -161,13 +161,13 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_cpu_to_cpu) TEST_F(TestModulePsiMemory, delete_memory_op_double_cpu) { double* h_xx = (double*)malloc(sizeof(double) * xx.size()); - delete_memory_double_cpu_op()(cpu_ctx, h_xx); + delete_memory_double_cpu_op()(h_xx); } TEST_F(TestModulePsiMemory, delete_memory_op_complex_double_cpu) { std::complex* hz_xx = (std::complex*)malloc(sizeof(std::complex) * z_xx.size()); - delete_memory_complex_double_cpu_op()(cpu_ctx, hz_xx); + delete_memory_complex_double_cpu_op()(hz_xx); } #if __UT_USE_CUDA || __UT_USE_ROCM @@ -175,7 +175,7 @@ TEST_F(TestModulePsiMemory, set_memory_op_double_gpu) { thrust::device_ptr d_xx = thrust::device_malloc(xx.size()); thrust::copy(xx.begin(), xx.end(), d_xx); - set_memory_double_gpu_op()(gpu_ctx, thrust::raw_pointer_cast(d_xx), 0, xx.size()); + set_memory_double_gpu_op()(thrust::raw_pointer_cast(d_xx), 0, xx.size()); thrust::host_vector h_xx(xx.size()); thrust::copy(d_xx, d_xx + xx.size(), h_xx.begin()); for (int ii = 0; ii < xx.size(); ii++) @@ -188,7 +188,7 @@ TEST_F(TestModulePsiMemory, set_memory_op_complex_double_gpu) { thrust::device_ptr> dz_xx = thrust::device_malloc>(z_xx.size()); thrust::copy(z_xx.begin(), z_xx.end(), dz_xx); - set_memory_complex_double_gpu_op()(gpu_ctx, thrust::raw_pointer_cast(dz_xx), 0, z_xx.size()); + set_memory_complex_double_gpu_op()(thrust::raw_pointer_cast(dz_xx), 0, z_xx.size()); thrust::host_vector> h_xx(z_xx.size()); thrust::copy(dz_xx, dz_xx + z_xx.size(), h_xx.begin()); for (int ii = 0; ii < z_xx.size(); ii++) @@ -200,7 +200,7 @@ TEST_F(TestModulePsiMemory, set_memory_op_complex_double_gpu) TEST_F(TestModulePsiMemory, resize_memory_op_double_gpu) { double* xx_tmp = NULL; - resize_memory_double_gpu_op()(gpu_ctx, xx_tmp, xx.size()); + resize_memory_double_gpu_op()(xx_tmp, xx.size()); thrust::device_ptr d_xx(xx_tmp); thrust::copy(xx.begin(), xx.end(), d_xx); @@ -217,7 +217,7 @@ TEST_F(TestModulePsiMemory, resize_memory_op_double_gpu) TEST_F(TestModulePsiMemory, resize_memory_op_complex_double_gpu) { std::complex* z_xx_tmp = NULL; - resize_memory_comlex_double_gpu_op()(gpu_ctx, z_xx_tmp, z_xx.size()); + resize_memory_comlex_double_gpu_op()(z_xx_tmp, z_xx.size()); thrust::device_ptr> dz_xx(z_xx_tmp); thrust::copy(z_xx.begin(), z_xx.end(), dz_xx); @@ -236,7 +236,7 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_double_cpu_to_gpu) thrust::device_ptr d_xx = thrust::device_malloc(xx.size()); std::vector hv_xx(xx.size(), 0); thrust::copy(hv_xx.begin(), hv_xx.end(), d_xx); - synchronize_memory_double_cpu_to_gpu_op()(gpu_ctx, cpu_ctx, thrust::raw_pointer_cast(d_xx), xx.data(), xx.size()); + synchronize_memory_double_cpu_to_gpu_op()(thrust::raw_pointer_cast(d_xx), xx.data(), xx.size()); thrust::host_vector h_xx(xx.size()); thrust::copy(d_xx, d_xx + xx.size(), h_xx.begin()); @@ -252,9 +252,7 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_double_gpu_to_cpu) thrust::device_ptr d_xx = thrust::device_malloc(xx.size()); thrust::copy(xx.begin(), xx.end(), d_xx); thrust::host_vector h_xx(xx.size()); - synchronize_memory_double_gpu_to_cpu_op()(cpu_ctx, - gpu_ctx, - thrust::raw_pointer_cast(h_xx.data()), + synchronize_memory_double_gpu_to_cpu_op()(thrust::raw_pointer_cast(h_xx.data()), thrust::raw_pointer_cast(d_xx), xx.size()); @@ -270,9 +268,7 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_double_gpu_to_gpu) thrust::device_ptr d1_xx = thrust::device_malloc(xx.size()); thrust::device_ptr d2_xx = thrust::device_malloc(xx.size()); thrust::copy(xx.begin(), xx.end(), d1_xx); - synchronize_memory_double_gpu_to_gpu_op()(gpu_ctx, - gpu_ctx, - thrust::raw_pointer_cast(d2_xx), + synchronize_memory_double_gpu_to_gpu_op()(thrust::raw_pointer_cast(d2_xx), thrust::raw_pointer_cast(d1_xx), xx.size()); @@ -291,9 +287,7 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_cpu_to_gpu) thrust::device_ptr> dz_xx = thrust::device_malloc>(z_xx.size()); std::vector> hvz_xx(z_xx.size(), 0); thrust::copy(hvz_xx.begin(), hvz_xx.end(), dz_xx); - synchronize_memory_complex_double_cpu_to_gpu_op()(gpu_ctx, - cpu_ctx, - thrust::raw_pointer_cast(dz_xx), + synchronize_memory_complex_double_cpu_to_gpu_op()(thrust::raw_pointer_cast(dz_xx), z_xx.data(), z_xx.size()); @@ -311,9 +305,7 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_gpu_to_cpu) thrust::device_ptr> dz_xx = thrust::device_malloc>(z_xx.size()); thrust::copy(z_xx.begin(), z_xx.end(), dz_xx); thrust::host_vector> hz_xx(z_xx.size()); - synchronize_memory_complex_double_gpu_to_cpu_op()(cpu_ctx, - gpu_ctx, - thrust::raw_pointer_cast(hz_xx.data()), + synchronize_memory_complex_double_gpu_to_cpu_op()(thrust::raw_pointer_cast(hz_xx.data()), thrust::raw_pointer_cast(dz_xx), z_xx.size()); @@ -329,9 +321,7 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_gpu_to_gpu) thrust::device_ptr> dz1_xx = thrust::device_malloc>(z_xx.size()); thrust::device_ptr> dz2_xx = thrust::device_malloc>(z_xx.size()); thrust::copy(z_xx.begin(), z_xx.end(), dz1_xx); - synchronize_memory_complex_double_gpu_to_gpu_op()(gpu_ctx, - gpu_ctx, - thrust::raw_pointer_cast(dz2_xx), + synchronize_memory_complex_double_gpu_to_gpu_op()(thrust::raw_pointer_cast(dz2_xx), thrust::raw_pointer_cast(dz1_xx), z_xx.size()); @@ -348,13 +338,13 @@ TEST_F(TestModulePsiMemory, synchronize_memory_op_complex_double_gpu_to_gpu) TEST_F(TestModulePsiMemory, delete_memory_op_double_gpu) { thrust::device_ptr d_xx = thrust::device_malloc(xx.size()); - delete_memory_double_gpu_op()(gpu_ctx, thrust::raw_pointer_cast(d_xx)); + delete_memory_double_gpu_op()(thrust::raw_pointer_cast(d_xx)); } TEST_F(TestModulePsiMemory, delete_memory_op_complex_double_gpu) { thrust::device_ptr> dz_xx = thrust::device_malloc>(z_xx.size()); - delete_memory_complex_double_gpu_op()(gpu_ctx, thrust::raw_pointer_cast(dz_xx)); + delete_memory_complex_double_gpu_op()(thrust::raw_pointer_cast(dz_xx)); } #endif // __UT_USE_CUDA || __UT_USE_ROCM diff --git a/source/module_base/parallel_device.h b/source/module_base/parallel_device.h index 09625f6303..7c41b8f28f 100644 --- a/source/module_base/parallel_device.h +++ b/source/module_base/parallel_device.h @@ -37,14 +37,14 @@ void bcast_dev(const Device* ctx, T* object, const int& n, const MPI_Comm& comm, { if(tmp_space == nullptr) { - base_device::memory::resize_memory_op()(cpu_ctx, object_cpu, n); + base_device::memory::resize_memory_op()(object_cpu, n); alloc = true; } else { object_cpu = tmp_space; } - base_device::memory::synchronize_memory_op()(cpu_ctx, ctx, object_cpu, object, n); + base_device::memory::synchronize_memory_op()(object_cpu, object, n); } else { @@ -55,10 +55,10 @@ void bcast_dev(const Device* ctx, T* object, const int& n, const MPI_Comm& comm, if (base_device::get_device_type(ctx) == base_device::GpuDevice) { - base_device::memory::synchronize_memory_op()(ctx, cpu_ctx, object, object_cpu, n); + base_device::memory::synchronize_memory_op()(object, object_cpu, n); if(alloc) { - base_device::memory::delete_memory_op()(cpu_ctx, object_cpu); + base_device::memory::delete_memory_op()(object_cpu); } } return; @@ -74,14 +74,14 @@ void reduce_dev(const Device* ctx, T* object, const int& n, const MPI_Comm& comm { if(tmp_space == nullptr) { - base_device::memory::resize_memory_op()(cpu_ctx, object_cpu, n); + base_device::memory::resize_memory_op()(object_cpu, n); alloc = true; } else { object_cpu = tmp_space; } - base_device::memory::synchronize_memory_op()(cpu_ctx, ctx, object_cpu, object, n); + base_device::memory::synchronize_memory_op()(object_cpu, object, n); } else { @@ -92,10 +92,10 @@ void reduce_dev(const Device* ctx, T* object, const int& n, const MPI_Comm& comm if (base_device::get_device_type(ctx) == base_device::GpuDevice) { - base_device::memory::synchronize_memory_op()(ctx, cpu_ctx, object, object_cpu, n); + base_device::memory::synchronize_memory_op()(object, object_cpu, n); if(alloc) { - base_device::memory::delete_memory_op()(cpu_ctx, object_cpu); + base_device::memory::delete_memory_op()(object_cpu); } } return; diff --git a/source/module_base/test/blas_connector_test.cpp b/source/module_base/test/blas_connector_test.cpp index 34f4cb51bb..dfe1e484b1 100644 --- a/source/module_base/test/blas_connector_test.cpp +++ b/source/module_base/test/blas_connector_test.cpp @@ -101,17 +101,17 @@ TEST(blas_connector, ScalGpu) { const int incx = 1; std::complex result[8], answer[8]; std::complex* result_gpu = nullptr; - resmem_zd_op()(gpu_ctx, result_gpu, 8 * sizeof(std::complex)); + resmem_zd_op()(result_gpu, 8 * sizeof(std::complex)); for (int i=0; i< size; i++) { result[i] = std::complex{static_cast(std::rand() / double(RAND_MAX)), static_cast(std::rand() / double(RAND_MAX))}; }; for (int i = 0; i < size; i++) answer[i] = result[i] * scale; - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, result_gpu, result, sizeof(std::complex) * 8); + syncmem_z2z_h2d_op()(result_gpu, result, sizeof(std::complex) * 8); BlasConnector::scal(size,scale,result_gpu,incx,base_device::AbacusDevice_t::GpuDevice); - syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, result, result_gpu, sizeof(std::complex) * 8); - delmem_zd_op()(gpu_ctx, result_gpu); + syncmem_z2z_d2h_op()(result, result_gpu, sizeof(std::complex) * 8); + delmem_zd_op()(result_gpu); // incx is the spacing between elements if result for (int i = 0; i < size; i++) { EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real()); @@ -198,8 +198,8 @@ TEST(blas_connector, AxpyGpu) { std::array x_const, result, answer; T* x_gpu = nullptr; T* result_gpu = nullptr; - resmem_zd_op()(gpu_ctx, x_gpu, size * sizeof(std::complex)); - resmem_zd_op()(gpu_ctx, result_gpu, size * sizeof(std::complex)); + resmem_zd_op()(x_gpu, size * sizeof(std::complex)); + resmem_zd_op()(result_gpu, size * sizeof(std::complex)); std::generate(x_const.begin(), x_const.end(), []() { return T{static_cast(std::rand() / double(RAND_MAX)), static_cast(std::rand() / double(RAND_MAX))}; @@ -210,12 +210,12 @@ TEST(blas_connector, AxpyGpu) { }); for (int i = 0; i < size; i++) answer[i] = x_const[i] * scale + result[i]; - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, result_gpu, result.data(), sizeof(std::complex) * size); - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, x_gpu, x_const.data(), sizeof(std::complex) * size); + syncmem_z2z_h2d_op()(result_gpu, result.data(), sizeof(std::complex) * size); + syncmem_z2z_h2d_op()(x_gpu, x_const.data(), sizeof(std::complex) * size); BlasConnector::axpy(size, scale, x_gpu, incx, result_gpu, incy, base_device::AbacusDevice_t::GpuDevice); - syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, result.data(), result_gpu, sizeof(std::complex) * size); - delmem_zd_op()(gpu_ctx, result_gpu); - delmem_zd_op()(gpu_ctx, x_gpu); + syncmem_z2z_d2h_op()(result.data(), result_gpu, sizeof(std::complex) * size); + delmem_zd_op()(result_gpu); + delmem_zd_op()(x_gpu); for (int i = 0; i < size; i++) { EXPECT_DOUBLE_EQ(answer[i].real(), result[i].real()); EXPECT_DOUBLE_EQ(answer[i].imag(), result[i].imag()); @@ -640,9 +640,9 @@ TEST(blas_connector, GemmGpu) { std::complex* a_gpu = nullptr; std::complex* b_gpu = nullptr; std::complex* result_gpu = nullptr; - resmem_zd_op()(gpu_ctx, a_gpu, size_k * lda * sizeof(std::complex)); - resmem_zd_op()(gpu_ctx, b_gpu, size_n * ldb * sizeof(std::complex)); - resmem_zd_op()(gpu_ctx, result_gpu, size_n * ldc * sizeof(std::complex)); + resmem_zd_op()(a_gpu, size_k * lda * sizeof(std::complex)); + resmem_zd_op()(b_gpu, size_n * ldb * sizeof(std::complex)); + resmem_zd_op()(result_gpu, size_n * ldc * sizeof(std::complex)); std::generate(a_const.begin(), a_const.end(), []() { return T{static_cast(std::rand() / double(RAND_MAX)), static_cast(std::rand() / double(RAND_MAX))}; @@ -665,16 +665,16 @@ TEST(blas_connector, GemmGpu) { beta_const * result[i + j * ldc]; } } - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, a_gpu, a_const.data(), sizeof(std::complex) * size_k * lda); - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, b_gpu, b_const.data(), sizeof(std::complex) * size_n * ldb); - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, result_gpu, result.data(), sizeof(std::complex) * size_n * ldc); + syncmem_z2z_h2d_op()(a_gpu, a_const.data(), sizeof(std::complex) * size_k * lda); + syncmem_z2z_h2d_op()(b_gpu, b_const.data(), sizeof(std::complex) * size_n * ldb); + syncmem_z2z_h2d_op()(result_gpu, result.data(), sizeof(std::complex) * size_n * ldc); BlasConnector::gemm_cm(transa_m, transb_m, size_m, size_n, size_k, alpha_const, a_gpu, lda, b_gpu, ldb, beta_const, result_gpu, ldc, base_device::AbacusDevice_t::GpuDevice); - syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, result.data(), result_gpu, sizeof(std::complex) * size_n * ldc); - delmem_zd_op()(gpu_ctx, result_gpu); - delmem_zd_op()(gpu_ctx, a_gpu); - delmem_zd_op()(gpu_ctx, b_gpu); + syncmem_z2z_d2h_op()(result.data(), result_gpu, sizeof(std::complex) * size_n * ldc); + delmem_zd_op()(result_gpu); + delmem_zd_op()(a_gpu); + delmem_zd_op()(b_gpu); for (int i = 0; i < size_m; i++) for (int j = 0; j < size_n; j++) { EXPECT_DOUBLE_EQ(answer[i + j * ldc].real(), diff --git a/source/module_basis/module_pw/kernels/test/pw_op_test.cpp b/source/module_basis/module_pw/kernels/test/pw_op_test.cpp index 96cc760383..6adac4613f 100644 --- a/source/module_basis/module_pw/kernels/test/pw_op_test.cpp +++ b/source/module_basis/module_pw/kernels/test/pw_op_test.cpp @@ -102,43 +102,43 @@ TEST_F(TestModulePWPWMultiDevice, set_3d_fft_box_op_gpu) std::vector> res(out_1.size(), std::complex{0, 0}); int * d_box_index = NULL; std::complex* d_res = NULL, * d_in_1 = NULL; - resize_memory_int_gpu_op()(gpu_ctx, d_box_index, box_index.size()); - resize_memory_complex_gpu_op()(gpu_ctx, d_res, res.size()); - resize_memory_complex_gpu_op()(gpu_ctx, d_in_1, in_1.size()); - synchronize_memory_int_h2d_op()(gpu_ctx, cpu_ctx, d_box_index, box_index.data(), box_index.size()); - synchronize_memory_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size()); - synchronize_memory_complex_h2d_op()(gpu_ctx, cpu_ctx, d_in_1, in_1.data(), in_1.size()); + resize_memory_int_gpu_op()(d_box_index, box_index.size()); + resize_memory_complex_gpu_op()(d_res, res.size()); + resize_memory_complex_gpu_op()(d_in_1, in_1.size()); + synchronize_memory_int_h2d_op()(d_box_index, box_index.data(), box_index.size()); + synchronize_memory_complex_h2d_op()(d_res, res.data(), res.size()); + synchronize_memory_complex_h2d_op()(d_in_1, in_1.data(), in_1.size()); set_3d_fft_box_gpu_op()(gpu_ctx, this->npwk, d_box_index, d_in_1, d_res); - synchronize_memory_complex_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size()); + synchronize_memory_complex_d2h_op()(res.data(), d_res, res.size()); for (int ii = 0; ii < this->nxyz; ii++) { EXPECT_LT(fabs(res[ii] - out_1[ii]), 1e-12); } - delete_memory_int_gpu_op()(gpu_ctx, d_box_index); - delete_memory_complex_gpu_op()(gpu_ctx, d_res); - delete_memory_complex_gpu_op()(gpu_ctx, d_in_1); + delete_memory_int_gpu_op()(d_box_index); + delete_memory_complex_gpu_op()(d_res); + delete_memory_complex_gpu_op()(d_in_1); } TEST_F(TestModulePWPWMultiDevice, set_recip_to_real_output_op_gpu) { std::vector> res(out_2.size(), std::complex{0, 0}); std::complex* d_res = NULL, * d_in_2 = NULL; - resize_memory_complex_gpu_op()(gpu_ctx, d_res, res.size()); - resize_memory_complex_gpu_op()(gpu_ctx, d_in_2, in_2.size()); - synchronize_memory_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size()); - synchronize_memory_complex_h2d_op()(gpu_ctx, cpu_ctx, d_in_2, in_2.data(), in_2.size()); + resize_memory_complex_gpu_op()(d_res, res.size()); + resize_memory_complex_gpu_op()(d_in_2, in_2.size()); + synchronize_memory_complex_h2d_op()(d_res, res.data(), res.size()); + synchronize_memory_complex_h2d_op()(d_in_2, in_2.data(), in_2.size()); set_recip_to_real_output_gpu_op()(gpu_ctx, this->nxyz, this->add, this->factor, d_in_2, d_res); - synchronize_memory_complex_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size()); + synchronize_memory_complex_d2h_op()(res.data(), d_res, res.size()); for (int ii = 0; ii < this->nxyz; ii++) { EXPECT_LT(fabs(res[ii] - out_2[ii]), 1e-12); } - delete_memory_complex_gpu_op()(gpu_ctx, d_res); - delete_memory_complex_gpu_op()(gpu_ctx, d_in_2); + delete_memory_complex_gpu_op()(d_res); + delete_memory_complex_gpu_op()(d_in_2); } TEST_F(TestModulePWPWMultiDevice, set_real_to_recip_output_op_gpu) @@ -146,23 +146,23 @@ TEST_F(TestModulePWPWMultiDevice, set_real_to_recip_output_op_gpu) std::vector> res = out_3_init; int * d_box_index = NULL; std::complex* d_res = NULL, * d_in_3 = NULL; - resize_memory_int_gpu_op()(gpu_ctx, d_box_index, box_index.size()); - resize_memory_complex_gpu_op()(gpu_ctx, d_res, res.size()); - resize_memory_complex_gpu_op()(gpu_ctx, d_in_3, in_3.size()); - synchronize_memory_int_h2d_op()(gpu_ctx, cpu_ctx, d_box_index, box_index.data(), box_index.size()); - synchronize_memory_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size()); - synchronize_memory_complex_h2d_op()(gpu_ctx, cpu_ctx, d_in_3, in_3.data(), in_3.size()); + resize_memory_int_gpu_op()(d_box_index, box_index.size()); + resize_memory_complex_gpu_op()(d_res, res.size()); + resize_memory_complex_gpu_op()(d_in_3, in_3.size()); + synchronize_memory_int_h2d_op()(d_box_index, box_index.data(), box_index.size()); + synchronize_memory_complex_h2d_op()(d_res, res.data(), res.size()); + synchronize_memory_complex_h2d_op()(d_in_3, in_3.data(), in_3.size()); set_real_to_recip_output_gpu_op()(gpu_ctx, this->npwk, this->nxyz, true, this->factor, d_box_index, d_in_3, d_res); - synchronize_memory_complex_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size()); + synchronize_memory_complex_d2h_op()(res.data(), d_res, res.size()); for (int ii = 0; ii < out_3.size(); ii++) { EXPECT_LT(fabs(res[ii] - out_3[ii]), 5e-6); } - delete_memory_int_gpu_op()(gpu_ctx, d_box_index); - delete_memory_complex_gpu_op()(gpu_ctx, d_res); - delete_memory_complex_gpu_op()(gpu_ctx, d_in_3); + delete_memory_int_gpu_op()(d_box_index); + delete_memory_complex_gpu_op()(d_res); + delete_memory_complex_gpu_op()(d_in_3); } #endif // __UT_USE_CUDA || __UT_USE_ROCM diff --git a/source/module_basis/module_pw/module_fft/fft_cuda.cpp b/source/module_basis/module_pw/module_fft/fft_cuda.cpp index db93fb07fb..9bec9253e7 100644 --- a/source/module_basis/module_pw/module_fft/fft_cuda.cpp +++ b/source/module_basis/module_pw/module_fft/fft_cuda.cpp @@ -17,14 +17,14 @@ template <> void FFT_CUDA::setupFFT() { cufftPlan3d(&c_handle, this->nx, this->ny, this->nz, CUFFT_C2C); - resmem_cd_op()(gpu_ctx, this->c_auxr_3d, this->nx * this->ny * this->nz); + resmem_cd_op()(this->c_auxr_3d, this->nx * this->ny * this->nz); } template <> void FFT_CUDA::setupFFT() { cufftPlan3d(&z_handle, this->nx, this->ny, this->nz, CUFFT_Z2Z); - resmem_zd_op()(gpu_ctx, this->z_auxr_3d, this->nx * this->ny * this->nz); + resmem_zd_op()(this->z_auxr_3d, this->nx * this->ny * this->nz); } template <> void FFT_CUDA::cleanFFT() @@ -50,7 +50,7 @@ void FFT_CUDA::clear() this->cleanFFT(); if (c_auxr_3d != nullptr) { - delmem_cd_op()(gpu_ctx, c_auxr_3d); + delmem_cd_op()(c_auxr_3d); c_auxr_3d = nullptr; } } @@ -60,7 +60,7 @@ void FFT_CUDA::clear() this->cleanFFT(); if (z_auxr_3d != nullptr) { - delmem_zd_op()(gpu_ctx, z_auxr_3d); + delmem_zd_op()(z_auxr_3d); z_auxr_3d = nullptr; } } diff --git a/source/module_basis/module_pw/module_fft/fft_rocm.cpp b/source/module_basis/module_pw/module_fft/fft_rocm.cpp index 9973c72901..1dd9c433ec 100644 --- a/source/module_basis/module_pw/module_fft/fft_rocm.cpp +++ b/source/module_basis/module_pw/module_fft/fft_rocm.cpp @@ -16,14 +16,14 @@ template <> void FFT_ROCM::setupFFT() { hipfftPlan3d(&c_handle, this->nx, this->ny, this->nz, HIPFFT_C2C); - resmem_cd_op()(gpu_ctx, this->c_auxr_3d, this->nx * this->ny * this->nz); + resmem_cd_op()(this->c_auxr_3d, this->nx * this->ny * this->nz); } template <> void FFT_ROCM::setupFFT() { hipfftPlan3d(&z_handle, this->nx, this->ny, this->nz, HIPFFT_Z2Z); - resmem_zd_op()(gpu_ctx, this->z_auxr_3d, this->nx * this->ny * this->nz); + resmem_zd_op()(this->z_auxr_3d, this->nx * this->ny * this->nz); } template <> void FFT_ROCM::cleanFFT() @@ -49,7 +49,7 @@ void FFT_ROCM::clear() this->cleanFFT(); if (c_auxr_3d != nullptr) { - delmem_cd_op()(gpu_ctx, c_auxr_3d); + delmem_cd_op()(c_auxr_3d); c_auxr_3d = nullptr; } } @@ -59,7 +59,7 @@ void FFT_ROCM::clear() this->cleanFFT(); if (z_auxr_3d != nullptr) { - delmem_zd_op()(gpu_ctx, z_auxr_3d); + delmem_zd_op()(z_auxr_3d); z_auxr_3d = nullptr; } } diff --git a/source/module_basis/module_pw/pw_basis.cpp b/source/module_basis/module_pw/pw_basis.cpp index 7c8a1293da..5fbff68f0c 100644 --- a/source/module_basis/module_pw/pw_basis.cpp +++ b/source/module_basis/module_pw/pw_basis.cpp @@ -39,7 +39,7 @@ PW_Basis:: ~PW_Basis() delete[] gg_uniq; #if defined(__CUDA) || defined(__ROCM) if (this->device == "gpu") { - delmem_int_op()(gpu_ctx, this->d_is2fftixy); + delmem_int_op()(this->d_is2fftixy); } #endif } diff --git a/source/module_basis/module_pw/pw_basis_k.cpp b/source/module_basis/module_pw/pw_basis_k.cpp index f670ee9bf9..2e0f85372d 100644 --- a/source/module_basis/module_pw/pw_basis_k.cpp +++ b/source/module_basis/module_pw/pw_basis_k.cpp @@ -25,24 +25,24 @@ PW_Basis_K::~PW_Basis_K() #if defined(__CUDA) || defined(__ROCM) if (this->device == "gpu") { if (this->precision == "single") { - delmem_sd_op()(gpu_ctx, this->s_kvec_c); - delmem_sd_op()(gpu_ctx, this->s_gcar); - delmem_sd_op()(gpu_ctx, this->s_gk2); + delmem_sd_op()(this->s_kvec_c); + delmem_sd_op()(this->s_gcar); + delmem_sd_op()(this->s_gk2); } else { - delmem_dd_op()(gpu_ctx, this->d_gcar); - delmem_dd_op()(gpu_ctx, this->d_gk2); + delmem_dd_op()(this->d_gcar); + delmem_dd_op()(this->d_gk2); } - delmem_dd_op()(gpu_ctx, this->d_kvec_c); - delmem_int_op()(gpu_ctx, this->ig2ixyz_k); - delmem_int_op()(gpu_ctx, this->d_igl2isz_k); + delmem_dd_op()(this->d_kvec_c); + delmem_int_op()(this->ig2ixyz_k); + delmem_int_op()(this->d_igl2isz_k); } else { #endif if (this->precision == "single") { - delmem_sh_op()(cpu_ctx, this->s_kvec_c); - delmem_sh_op()(cpu_ctx, this->s_gcar); - delmem_sh_op()(cpu_ctx, this->s_gk2); + delmem_sh_op()(this->s_kvec_c); + delmem_sh_op()(this->s_gcar); + delmem_sh_op()(this->s_gk2); } // There's no need to delete double pointers while in a CPU environment. #if defined(__CUDA) || defined(__ROCM) @@ -99,17 +99,17 @@ void PW_Basis_K:: initparameters( #if defined(__CUDA) || defined(__ROCM) if (this->device == "gpu") { if (this->precision == "single") { - resmem_sd_op()(gpu_ctx, this->s_kvec_c, this->nks * 3); - castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_kvec_c, reinterpret_cast(&this->kvec_c[0][0]), this->nks * 3); + resmem_sd_op()(this->s_kvec_c, this->nks * 3); + castmem_d2s_h2d_op()(this->s_kvec_c, reinterpret_cast(&this->kvec_c[0][0]), this->nks * 3); } - resmem_dd_op()(gpu_ctx, this->d_kvec_c, this->nks * 3); - syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_kvec_c, reinterpret_cast(&this->kvec_c[0][0]), this->nks * 3); + resmem_dd_op()(this->d_kvec_c, this->nks * 3); + syncmem_d2d_h2d_op()(this->d_kvec_c, reinterpret_cast(&this->kvec_c[0][0]), this->nks * 3); } else { #endif if (this->precision == "single") { - resmem_sh_op()(cpu_ctx, this->s_kvec_c, this->nks * 3); - castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_kvec_c, reinterpret_cast(&this->kvec_c[0][0]), this->nks * 3); + resmem_sh_op()(this->s_kvec_c, this->nks * 3); + castmem_d2s_h2h_op()(this->s_kvec_c, reinterpret_cast(&this->kvec_c[0][0]), this->nks * 3); } this->d_kvec_c = reinterpret_cast(&this->kvec_c[0][0]); // There's no need to allocate double pointers while in a CPU environment. @@ -164,8 +164,8 @@ void PW_Basis_K::setupIndGk() } #if defined(__CUDA) || defined(__ROCM) if (this->device == "gpu") { - resmem_int_op()(gpu_ctx, this->d_igl2isz_k, this->npwk_max * this->nks); - syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, this->d_igl2isz_k, this->igl2isz_k, this->npwk_max * this->nks); + resmem_int_op()(this->d_igl2isz_k, this->npwk_max * this->nks); + syncmem_int_h2d_op()(this->d_igl2isz_k, this->igl2isz_k, this->npwk_max * this->nks); } #endif this->get_ig2ixyz_k(); @@ -247,25 +247,25 @@ void PW_Basis_K::collect_local_pw(const double& erf_ecut_in, const double& erf_h #if defined(__CUDA) || defined(__ROCM) if (this->device == "gpu") { if (this->precision == "single") { - resmem_sd_op()(gpu_ctx, this->s_gk2, this->npwk_max * this->nks); - resmem_sd_op()(gpu_ctx, this->s_gcar, this->npwk_max * this->nks * 3); - castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_gk2, this->gk2, this->npwk_max * this->nks); - castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_gcar, reinterpret_cast(&this->gcar[0][0]), this->npwk_max * this->nks * 3); + resmem_sd_op()(this->s_gk2, this->npwk_max * this->nks); + resmem_sd_op()(this->s_gcar, this->npwk_max * this->nks * 3); + castmem_d2s_h2d_op()(this->s_gk2, this->gk2, this->npwk_max * this->nks); + castmem_d2s_h2d_op()(this->s_gcar, reinterpret_cast(&this->gcar[0][0]), this->npwk_max * this->nks * 3); } else { - resmem_dd_op()(gpu_ctx, this->d_gk2, this->npwk_max * this->nks); - resmem_dd_op()(gpu_ctx, this->d_gcar, this->npwk_max * this->nks * 3); - syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_gk2, this->gk2, this->npwk_max * this->nks); - syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_gcar, reinterpret_cast(&this->gcar[0][0]), this->npwk_max * this->nks * 3); + resmem_dd_op()(this->d_gk2, this->npwk_max * this->nks); + resmem_dd_op()(this->d_gcar, this->npwk_max * this->nks * 3); + syncmem_d2d_h2d_op()(this->d_gk2, this->gk2, this->npwk_max * this->nks); + syncmem_d2d_h2d_op()(this->d_gcar, reinterpret_cast(&this->gcar[0][0]), this->npwk_max * this->nks * 3); } } else { #endif if (this->precision == "single") { - resmem_sh_op()(cpu_ctx, this->s_gk2, this->npwk_max * this->nks, "PW_B_K::s_gk2"); - resmem_sh_op()(cpu_ctx, this->s_gcar, this->npwk_max * this->nks * 3, "PW_B_K::s_gcar"); - castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_gk2, this->gk2, this->npwk_max * this->nks); - castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_gcar, reinterpret_cast(&this->gcar[0][0]), this->npwk_max * this->nks * 3); + resmem_sh_op()(this->s_gk2, this->npwk_max * this->nks, "PW_B_K::s_gk2"); + resmem_sh_op()(this->s_gcar, this->npwk_max * this->nks * 3, "PW_B_K::s_gcar"); + castmem_d2s_h2h_op()(this->s_gk2, this->gk2, this->npwk_max * this->nks); + castmem_d2s_h2h_op()(this->s_gcar, reinterpret_cast(&this->gcar[0][0]), this->npwk_max * this->nks * 3); } else { this->d_gcar = reinterpret_cast(&this->gcar[0][0]); @@ -355,8 +355,8 @@ void PW_Basis_K::get_ig2ixyz_k() ig2ixyz_k_cpu[igl + ik * npwk_max] = iz + iy * nz + ix * ny * nz; } } - resmem_int_op()(gpu_ctx, ig2ixyz_k, this->npwk_max * this->nks); - syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, this->ig2ixyz_k, ig2ixyz_k_cpu, this->npwk_max * this->nks); + resmem_int_op()(ig2ixyz_k, this->npwk_max * this->nks); + syncmem_int_h2d_op()(this->ig2ixyz_k, ig2ixyz_k_cpu, this->npwk_max * this->nks); delete[] ig2ixyz_k_cpu; } diff --git a/source/module_basis/module_pw/pw_basis_sup.cpp b/source/module_basis/module_pw/pw_basis_sup.cpp index 1d34682a96..e5422bd5d3 100644 --- a/source/module_basis/module_pw/pw_basis_sup.cpp +++ b/source/module_basis/module_pw/pw_basis_sup.cpp @@ -100,8 +100,9 @@ void PW_Basis_Sup::distribution_method3(const ModulePW::PW_Basis* pw_rho) this->npw_per = new int[this->poolnproc]; // number of planewaves on each core. delete[] this->fftixy2ip; this->fftixy2ip = new int[this->fftnxy]; // ip of core which contains the stick on (x, y). - for (int ixy = 0; ixy < this->fftnxy; ++ixy) + for (int ixy = 0; ixy < this->fftnxy; ++ixy) { this->fftixy2ip[ixy] = -1; // meaning this stick has not been distributed or there is no stick on (x, y). +} if (poolrank == 0) { // (1) Count the total number of planewaves (tot_npw) and sticks (this->nstot). @@ -212,10 +213,11 @@ void PW_Basis_Sup::divide_sticks_3( int fftnx_s = nx_s; if (this->gamma_only) { - if (this->xprime) + if (this->xprime) { fftnx_s = int(nx_s / 2) + 1; - else + } else { fftny_s = int(ny_s / 2) + 1; +} } int fftnxy_s = fftnx_s * fftny_s; @@ -225,15 +227,19 @@ void PW_Basis_Sup::divide_sticks_3( { int ix = ixy / fftny_s; int iy = ixy % fftny_s; - if (ix >= int(nx_s / 2) + 1) + if (ix >= int(nx_s / 2) + 1) { ix -= nx_s; - if (iy >= int(ny_s / 2) + 1) +} + if (iy >= int(ny_s / 2) + 1) { iy -= ny_s; +} - if (ix < 0) + if (ix < 0) { ix += nx; - if (iy < 0) +} + if (iy < 0) { iy += ny; +} int index = ix * this->fftny + iy; int ip = fftixy2ip_s[ixy]; if (ip >= 0) @@ -312,7 +318,7 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy( #if defined(__CUDA) || defined(__ROCM) if (this->device == "gpu") { - delmem_int_op()(gpu_ctx, this->d_is2fftixy); + delmem_int_op()(this->d_is2fftixy); d_is2fftixy = nullptr; } #endif @@ -349,8 +355,9 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy( fftixy2is[ixy] = st_move; st_move++; } - if (st_move == this->nst) + if (st_move == this->nst) { break; +} } // distribute planewaves in the same order as smooth grids first. @@ -363,19 +370,25 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy( int ixy = pw_rho->is2fftixy[is]; int ix = ixy / pw_rho->fftny; int iy = ixy % pw_rho->fftny; - if (ix >= int(pw_rho->nx / 2) + 1) + if (ix >= int(pw_rho->nx / 2) + 1) { ix -= pw_rho->nx; - if (iy >= int(pw_rho->ny / 2) + 1) +} + if (iy >= int(pw_rho->ny / 2) + 1) { iy -= pw_rho->ny; - if (iz >= int(pw_rho->nz / 2) + 1) +} + if (iz >= int(pw_rho->nz / 2) + 1) { iz -= pw_rho->nz; +} - if (ix < 0) + if (ix < 0) { ix += this->nx; - if (iy < 0) +} + if (iy < 0) { iy += this->ny; - if (iz < 0) +} + if (iz < 0) { iz += this->nz; +} int ixy_now = ix * this->fftny + iy; int index = ixy_now * this->nz + iz; int is_now = fftixy2is[ixy_now]; @@ -383,8 +396,9 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy( this->ig2isz[ig] = isz_now; pw_filled++; found[index] = true; - if (xprime && ix == 0) + if (xprime && ix == 0) { ng_xeq0++; +} } assert(pw_filled == pw_rho->npw); @@ -397,21 +411,24 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy( for (int iz = zstart; iz < zstart + st_length2D[ixy]; ++iz) { int z = iz; - if (z < 0) + if (z < 0) { z += this->nz; +} if (!found[ixy * this->nz + z]) { found[ixy * this->nz + z] = true; int is = fftixy2is[ixy]; this->ig2isz[pw_filled] = is * this->nz + z; pw_filled++; - if (xprime && ixy / fftny == 0) + if (xprime && ixy / fftny == 0) { ng_xeq0++; +} } } } - if (pw_filled == this->npw) + if (pw_filled == this->npw) { break; +} } delete[] fftixy2is; @@ -420,8 +437,8 @@ void PW_Basis_Sup::get_ig2isz_is2fftixy( #if defined(__CUDA) || defined(__ROCM) if (this->device == "gpu") { - resmem_int_op()(gpu_ctx, d_is2fftixy, this->nst); - syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, this->d_is2fftixy, this->is2fftixy, this->nst); + resmem_int_op()(d_is2fftixy, this->nst); + syncmem_int_h2d_op()(this->d_is2fftixy, this->is2fftixy, this->nst); } #endif return; diff --git a/source/module_basis/module_pw/pw_distributeg.cpp b/source/module_basis/module_pw/pw_distributeg.cpp index c93ff9357a..0e92d6f665 100644 --- a/source/module_basis/module_pw/pw_distributeg.cpp +++ b/source/module_basis/module_pw/pw_distributeg.cpp @@ -101,8 +101,10 @@ void PW_Basis::count_pw_st( // so that its index in st_length and st_bottom is 9 * 10 + 2 = 92. int x = ix; int y = iy; - if (x < 0) x += this->nx; - if (y < 0) y += this->ny; + if (x < 0) { x += this->nx; +} + if (y < 0) { y += this->ny; +} int index = x * this->fftny + y; int length = 0; // number of planewave on stick (x, y). @@ -114,13 +116,18 @@ void PW_Basis::count_pw_st( double modulus = f * (this->GGT * f); if (modulus <= this->ggecut || this->full_pw) { - if (length == 0) st_bottom2D[index] = iz; // length == 0 means this point is the bottom of stick (x, y). + if (length == 0) { st_bottom2D[index] = iz; // length == 0 means this point is the bottom of stick (x, y). +} ++this->npwtot; ++length; - if(iy < this->riy) this->riy = iy; - if(iy > this->liy) this->liy = iy; - if(ix < this->rix) this->rix = ix; - if(ix > this->lix) this->lix = ix; + if(iy < this->riy) { this->riy = iy; +} + if(iy > this->liy) { this->liy = iy; +} + if(ix < this->rix) { this->rix = ix; +} + if(ix > this->lix) { this->lix = ix; +} } } if (length > 0) @@ -157,7 +164,7 @@ void PW_Basis::get_ig2isz_is2fftixy( delete[] this->is2fftixy; this->is2fftixy = nullptr; // map is (index of sticks) to ixy (iy + ix * fftny). #if defined(__CUDA) || defined(__ROCM) if (this->device == "gpu") { - delmem_int_op()(gpu_ctx, this->d_is2fftixy); + delmem_int_op()(this->d_is2fftixy); d_is2fftixy = nullptr; } #endif @@ -182,20 +189,23 @@ void PW_Basis::get_ig2isz_is2fftixy( for (int iz = zstart; iz < zstart + st_length2D[ixy]; ++iz) { int z = iz; - if (z < 0) z += this->nz; + if (z < 0) { z += this->nz; +} this->ig2isz[pw_filled] = st_move * this->nz + z; pw_filled++; } this->is2fftixy[st_move] = ixy; st_move++; - if(xprime && ixy/fftny == 0) ng_xeq0 = pw_filled; + if(xprime && ixy/fftny == 0) { ng_xeq0 = pw_filled; +} } - if (st_move == this->nst && pw_filled == this->npw) break; + if (st_move == this->nst && pw_filled == this->npw) { break; +} } #if defined(__CUDA) || defined(__ROCM) if (this->device == "gpu") { - resmem_int_op()(gpu_ctx, d_is2fftixy, this->nst); - syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, this->d_is2fftixy, this->is2fftixy, this->nst); + resmem_int_op()(d_is2fftixy, this->nst); + syncmem_int_h2d_op()(this->d_is2fftixy, this->is2fftixy, this->nst); } #endif return; diff --git a/source/module_basis/module_pw/pw_transform_k.cpp b/source/module_basis/module_pw/pw_transform_k.cpp index 5e3780eef4..e230066c8f 100644 --- a/source/module_basis/module_pw/pw_transform_k.cpp +++ b/source/module_basis/module_pw/pw_transform_k.cpp @@ -345,8 +345,6 @@ void PW_Basis_K::real_to_recip(const base_device::DEVICE_GPU* ctx, assert(this->poolnproc == 1); base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()( - ctx, - ctx, this->fft_bundle.get_auxr_3d_data(), in, this->nrxx); @@ -379,9 +377,7 @@ void PW_Basis_K::real_to_recip(const base_device::DEVICE_GPU* ctx, base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, - base_device::DEVICE_GPU>()(ctx, - ctx, - this->fft_bundle.get_auxr_3d_data(), + base_device::DEVICE_GPU>()(this->fft_bundle.get_auxr_3d_data(), in, this->nrxx); @@ -413,7 +409,6 @@ void PW_Basis_K::recip_to_real(const base_device::DEVICE_GPU* ctx, assert(this->poolnproc == 1); // ModuleBase::GlobalFunc::ZEROS(fft_bundle.get_auxr_3d_data(), this->nxyz); base_device::memory::set_memory_op, base_device::DEVICE_GPU>()( - ctx, this->fft_bundle.get_auxr_3d_data(), 0, this->nxyz); @@ -450,7 +445,6 @@ void PW_Basis_K::recip_to_real(const base_device::DEVICE_GPU* ctx, assert(this->poolnproc == 1); // ModuleBase::GlobalFunc::ZEROS(fft_bundle.get_auxr_3d_data(), this->nxyz); base_device::memory::set_memory_op, base_device::DEVICE_GPU>()( - ctx, this->fft_bundle.get_auxr_3d_data(), 0, this->nxyz); diff --git a/source/module_elecstate/elecstate_pw.cpp b/source/module_elecstate/elecstate_pw.cpp index f55f2ec447..f241c59db8 100644 --- a/source/module_elecstate/elecstate_pw.cpp +++ b/source/module_elecstate/elecstate_pw.cpp @@ -33,26 +33,26 @@ ElecStatePW::~ElecStatePW() { if (PARAM.inp.device == "gpu" || PARAM.inp.precision == "single") { - delmem_var_op()(this->ctx, this->rho_data); + delmem_var_op()(this->rho_data); delete[] this->rho; if (PARAM.globalv.double_grid || PARAM.globalv.use_uspp) { - delmem_complex_op()(this->ctx, this->rhog_data); + delmem_complex_op()(this->rhog_data); delete[] this->rhog; } if (get_xc_func_type() == 3 || PARAM.inp.out_elf[0] > 0) { - delmem_var_op()(this->ctx, this->kin_r_data); + delmem_var_op()(this->kin_r_data); delete[] this->kin_r; } } if (PARAM.globalv.use_uspp) { - delmem_var_op()(this->ctx, this->becsum); + delmem_var_op()(this->becsum); } - delmem_complex_op()(this->ctx, this->wfcr); - delmem_complex_op()(this->ctx, this->wfcr_another_spin); + delmem_complex_op()(this->wfcr); + delmem_complex_op()(this->wfcr_another_spin); } template @@ -66,7 +66,7 @@ void ElecStatePW::init_rho_data() if (PARAM.inp.device == "gpu" || PARAM.inp.precision == "single") { this->rho = new Real*[this->charge->nspin]; - resmem_var_op()(this->ctx, this->rho_data, this->charge->nspin * this->charge->nrxx); + resmem_var_op()(this->rho_data, this->charge->nspin * this->charge->nrxx); for (int ii = 0; ii < this->charge->nspin; ii++) { this->rho[ii] = this->rho_data + ii * this->charge->nrxx; @@ -74,7 +74,7 @@ void ElecStatePW::init_rho_data() if (PARAM.globalv.double_grid || PARAM.globalv.use_uspp) { this->rhog = new T*[this->charge->nspin]; - resmem_complex_op()(this->ctx, this->rhog_data, this->charge->nspin * this->charge->rhopw->npw); + resmem_complex_op()(this->rhog_data, this->charge->nspin * this->charge->rhopw->npw); for (int ii = 0; ii < this->charge->nspin; ii++) { this->rhog[ii] = this->rhog_data + ii * this->charge->rhopw->npw; @@ -83,7 +83,7 @@ void ElecStatePW::init_rho_data() if (get_xc_func_type() == 3 || PARAM.inp.out_elf[0] > 0) { this->kin_r = new Real*[this->charge->nspin]; - resmem_var_op()(this->ctx, this->kin_r_data, this->charge->nspin * this->charge->nrxx); + resmem_var_op()(this->kin_r_data, this->charge->nspin * this->charge->nrxx); for (int ii = 0; ii < this->charge->nspin; ii++) { this->kin_r[ii] = this->kin_r_data + ii * this->charge->nrxx; } @@ -101,8 +101,8 @@ void ElecStatePW::init_rho_data() this->kin_r = reinterpret_cast(this->charge->kin_r); } } - resmem_complex_op()(this->ctx, this->wfcr, this->basis->nmaxgr, "ElecSPW::wfcr"); - resmem_complex_op()(this->ctx, this->wfcr_another_spin, this->basis->nrxx, "ElecSPW::wfcr_a"); + resmem_complex_op()(this->wfcr, this->basis->nmaxgr, "ElecSPW::wfcr"); + resmem_complex_op()(this->wfcr_another_spin, this->basis->nrxx, "ElecSPW::wfcr_a"); this->init_rho = true; } @@ -118,15 +118,15 @@ void ElecStatePW::psiToRho(const psi::Psi& psi) { // denghui replaced at 20221110 // ModuleBase::GlobalFunc::ZEROS(this->rho[is], this->charge->nrxx); - setmem_var_op()(this->ctx, this->rho[is], 0, this->charge->nrxx); + setmem_var_op()(this->rho[is], 0, this->charge->nrxx); if (get_xc_func_type() == 3) { // ModuleBase::GlobalFunc::ZEROS(this->charge->kin_r[is], this->charge->nrxx); - setmem_var_op()(this->ctx, this->kin_r[is], 0, this->charge->nrxx); + setmem_var_op()(this->kin_r[is], 0, this->charge->nrxx); } if (PARAM.globalv.double_grid || PARAM.globalv.use_uspp) { - setmem_complex_op()(this->ctx, this->rhog[is], 0, this->charge->rhopw->npw); + setmem_complex_op()(this->rhog[is], 0, this->charge->rhopw->npw); } } @@ -142,10 +142,10 @@ void ElecStatePW::psiToRho(const psi::Psi& psi) { for (int ii = 0; ii < PARAM.inp.nspin; ii++) { - castmem_var_d2h_op()(cpu_ctx, this->ctx, this->charge->rho[ii], this->rho[ii], this->charge->nrxx); + castmem_var_d2h_op()(this->charge->rho[ii], this->rho[ii], this->charge->nrxx); if (get_xc_func_type() == 3) { - castmem_var_d2h_op()(cpu_ctx, this->ctx, this->charge->kin_r[ii], this->kin_r[ii], this->charge->nrxx); + castmem_var_d2h_op()(this->charge->kin_r[ii], this->kin_r[ii], this->charge->nrxx); } } } @@ -244,7 +244,7 @@ void ElecStatePW::rhoBandK(const psi::Psi& psi) { for (int j = 0; j < 3; j++) { - setmem_complex_op()(this->ctx, this->wfcr, 0, this->charge->nrxx); + setmem_complex_op()(this->wfcr, 0, this->charge->nrxx); meta_op()(this->ctx, ik, @@ -277,10 +277,10 @@ void ElecStatePW::cal_becsum(const psi::Psi& psi) const int nkb = this->ppcell->nkb; this->vkb = this->ppcell->template get_vkb_data(); T* becp = nullptr; - resmem_complex_op()(this->ctx, becp, nbands * nkb, "ElecState::becp"); + resmem_complex_op()(becp, nbands * nkb, "ElecState::becp"); const int nh_tot = this->ppcell->nhm * (this->ppcell->nhm + 1) / 2; - resmem_var_op()(this->ctx, becsum, nh_tot * ucell->nat * PARAM.inp.nspin, "ElecState::becsum"); - setmem_var_op()(this->ctx, becsum, 0, nh_tot * ucell->nat * PARAM.inp.nspin); + resmem_var_op()(becsum, nh_tot * ucell->nat * PARAM.inp.nspin, "ElecState::becsum"); + setmem_var_op()(becsum, 0, nh_tot * ucell->nat * PARAM.inp.nspin); for (int ik = 0; ik < psi.get_nk(); ++ik) { @@ -340,10 +340,9 @@ void ElecStatePW::cal_becsum(const psi::Psi& psi) if (atom->ncpp.tvanp) { T *auxk1 = nullptr, *auxk2 = nullptr, *aux_gk = nullptr; - resmem_complex_op()(this->ctx, auxk1, nbands * atom->ncpp.nh, "ElecState::auxk1"); - resmem_complex_op()(this->ctx, auxk2, nbands * atom->ncpp.nh, "ElecState::auxk2"); - resmem_complex_op()(this->ctx, - aux_gk, + resmem_complex_op()(auxk1, nbands * atom->ncpp.nh, "ElecState::auxk1"); + resmem_complex_op()(auxk2, nbands * atom->ncpp.nh, "ElecState::auxk2"); + resmem_complex_op()(aux_gk, atom->ncpp.nh * atom->ncpp.nh * npol * npol, "ElecState::aux_gk"); for (int ia = 0; ia < atom->na; ia++) @@ -414,13 +413,13 @@ void ElecStatePW::cal_becsum(const psi::Psi& psi) } } } - delmem_complex_op()(this->ctx, auxk1); - delmem_complex_op()(this->ctx, auxk2); - delmem_complex_op()(this->ctx, aux_gk); + delmem_complex_op()(auxk1); + delmem_complex_op()(auxk2); + delmem_complex_op()(aux_gk); } } } - delmem_complex_op()(this->ctx, becp); + delmem_complex_op()(becp); } template @@ -469,11 +468,11 @@ void ElecStatePW::addusdens_g(const Real* becsum, T** rhog) const std::complex ci_tpi = ModuleBase::NEG_IMAG_UNIT * ModuleBase::TWO_PI; Real* qmod = nullptr; - resmem_var_op()(this->ctx, qmod, npw, "ElecState::qmod"); + resmem_var_op()(qmod, npw, "ElecState::qmod"); T* qgm = nullptr; - resmem_complex_op()(this->ctx, qgm, npw, "ElecState::qgm"); + resmem_complex_op()(qgm, npw, "ElecState::qgm"); Real* ylmk0 = nullptr; - resmem_var_op()(this->ctx, ylmk0, npw * lmaxq * lmaxq, "ElecState::ylmk0"); + resmem_var_op()(ylmk0, npw * lmaxq * lmaxq, "ElecState::ylmk0"); Real* g = reinterpret_cast(this->charge->rhopw->gcar); ModuleBase::YlmReal::Ylm_Real(this->ctx, lmaxq * lmaxq, npw, g, ylmk0); @@ -492,9 +491,9 @@ void ElecStatePW::addusdens_g(const Real* becsum, T** rhog) const int nij = atom->ncpp.nh * (atom->ncpp.nh + 1) / 2; T *skk = nullptr, *aux2 = nullptr, *tbecsum = nullptr; - resmem_complex_op()(this->ctx, skk, atom->na * npw, "ElecState::skk"); - resmem_complex_op()(this->ctx, aux2, nij * npw, "ElecState::aux2"); - resmem_complex_op()(this->ctx, tbecsum, PARAM.inp.nspin * atom->na * nij, "ElecState::tbecsum"); + resmem_complex_op()(skk, atom->na * npw, "ElecState::skk"); + resmem_complex_op()(aux2, nij * npw, "ElecState::aux2"); + resmem_complex_op()(tbecsum, PARAM.inp.nspin * atom->na * nij, "ElecState::tbecsum"); for (int ia = 0; ia < atom->na; ia++) { const int iat = ucell->itia2iat(it, ia); @@ -548,15 +547,15 @@ void ElecStatePW::addusdens_g(const Real* becsum, T** rhog) } } } - delmem_complex_op()(this->ctx, skk); - delmem_complex_op()(this->ctx, aux2); - delmem_complex_op()(this->ctx, tbecsum); + delmem_complex_op()(skk); + delmem_complex_op()(aux2); + delmem_complex_op()(tbecsum); } } - delmem_var_op()(this->ctx, qmod); - delmem_complex_op()(this->ctx, qgm); - delmem_var_op()(this->ctx, ylmk0); + delmem_var_op()(qmod); + delmem_complex_op()(qgm); + delmem_var_op()(ylmk0); } template class ElecStatePW, base_device::DEVICE_CPU>; diff --git a/source/module_elecstate/elecstate_pw_cal_tau.cpp b/source/module_elecstate/elecstate_pw_cal_tau.cpp index ad8c9ce42f..5c225c3d62 100644 --- a/source/module_elecstate/elecstate_pw_cal_tau.cpp +++ b/source/module_elecstate/elecstate_pw_cal_tau.cpp @@ -9,7 +9,7 @@ void ElecStatePW::cal_tau(const psi::Psi& psi) ModuleBase::TITLE("ElecStatePW", "cal_tau"); for(int is=0; isctx, this->kin_r[is], 0, this->charge->nrxx); + setmem_var_op()(this->kin_r[is], 0, this->charge->nrxx); } for (int ik = 0; ik < psi.get_nk(); ++ik) @@ -31,7 +31,7 @@ void ElecStatePW::cal_tau(const psi::Psi& psi) // kinetic energy density for (int j = 0; j < 3; j++) { - setmem_complex_op()(this->ctx, this->wfcr, 0, this->charge->nrxx); + setmem_complex_op()(this->wfcr, 0, this->charge->nrxx); meta_op()(this->ctx, ik, @@ -52,7 +52,7 @@ void ElecStatePW::cal_tau(const psi::Psi& psi) } if (PARAM.inp.device == "gpu" || PARAM.inp.precision == "single") { for (int ii = 0; ii < PARAM.inp.nspin; ii++) { - castmem_var_d2h_op()(cpu_ctx, this->ctx, this->charge->kin_r[ii], this->kin_r[ii], this->charge->nrxx); + castmem_var_d2h_op()(this->charge->kin_r[ii], this->kin_r[ii], this->charge->nrxx); } } this->parallelK(); diff --git a/source/module_elecstate/elecstate_pw_sdft.cpp b/source/module_elecstate/elecstate_pw_sdft.cpp index ad6f98c3c3..bef6277adb 100644 --- a/source/module_elecstate/elecstate_pw_sdft.cpp +++ b/source/module_elecstate/elecstate_pw_sdft.cpp @@ -16,7 +16,7 @@ void ElecStatePW_SDFT::psiToRho(const psi::Psi& psi) const int nspin = PARAM.inp.nspin; for (int is = 0; is < nspin; is++) { - setmem_var_op()(this->ctx, this->rho[is], 0, this->charge->nrxx); + setmem_var_op()(this->rho[is], 0, this->charge->nrxx); } if (GlobalV::MY_STOGROUP == 0) @@ -28,7 +28,7 @@ void ElecStatePW_SDFT::psiToRho(const psi::Psi& psi) } if (PARAM.inp.device == "gpu" || PARAM.inp.precision == "single") { for (int ii = 0; ii < nspin; ii++) { - castmem_var_d2h_op()(cpu_ctx, this->ctx, this->charge->rho[ii], this->rho[ii], this->charge->nrxx); + castmem_var_d2h_op()(this->charge->rho[ii], this->rho[ii], this->charge->nrxx); } } this->parallelK(); diff --git a/source/module_elecstate/kernels/test/elecstate_op_test.cpp b/source/module_elecstate/kernels/test/elecstate_op_test.cpp index 79635c7895..973df83cea 100644 --- a/source/module_elecstate/kernels/test/elecstate_op_test.cpp +++ b/source/module_elecstate/kernels/test/elecstate_op_test.cpp @@ -107,10 +107,10 @@ TEST_F(TestModuleElecstateMultiDevice, elecstate_pw_op_gpu) std::vector rho_data(expected_rho.size(), 0); double* d_rho_data = NULL; std::complex* d_wfcr = NULL; - resize_memory_var_op()(gpu_ctx, d_rho_data, rho_data.size()); - resize_memory_complex_op()(gpu_ctx, d_wfcr, wfcr.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_rho_data, rho_data.data(), rho_data.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_wfcr, wfcr.data(), wfcr.size()); + resize_memory_var_op()(d_rho_data, rho_data.size()); + resize_memory_complex_op()(d_wfcr, wfcr.size()); + syncmem_var_h2d_op()(d_rho_data, rho_data.data(), rho_data.size()); + syncmem_complex_h2d_op()(d_wfcr, wfcr.data(), wfcr.size()); double ** rho = new double* [1]; rho[0] = d_rho_data; elecstate_gpu_op()( @@ -120,7 +120,7 @@ TEST_F(TestModuleElecstateMultiDevice, elecstate_pw_op_gpu) rho, d_wfcr); - syncmem_var_d2h_op()(cpu_ctx, gpu_ctx, rho_data.data(), d_rho_data, rho_data.size()); + syncmem_var_d2h_op()(rho_data.data(), d_rho_data, rho_data.size()); // check the result for (int ii = 0; ii < rho_data.size(); ii++) { EXPECT_LT(fabs(rho_data[ii] - expected_rho[ii]), 6e-5); @@ -136,12 +136,12 @@ TEST_F(TestModuleElecstateMultiDevice, elecstate_pw_spin_op_gpu) double* d_rho_data_2 = NULL; std::complex* d_wfcr_2 = NULL; std::complex* d_wfcr_another_spin_2 = NULL; - resize_memory_var_op()(gpu_ctx, d_rho_data_2, rho_data_2.size()); - resize_memory_complex_op()(gpu_ctx, d_wfcr_2, wfcr_2.size()); - resize_memory_complex_op()(gpu_ctx, d_wfcr_another_spin_2, wfcr_another_spin_2.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_rho_data_2, rho_data_2.data(), rho_data_2.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_wfcr_2, wfcr_2.data(), wfcr_2.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_wfcr_another_spin_2, wfcr_another_spin_2.data(), wfcr_another_spin_2.size()); + resize_memory_var_op()(d_rho_data_2, rho_data_2.size()); + resize_memory_complex_op()(d_wfcr_2, wfcr_2.size()); + resize_memory_complex_op()(d_wfcr_another_spin_2, wfcr_another_spin_2.size()); + syncmem_var_h2d_op()(d_rho_data_2, rho_data_2.data(), rho_data_2.size()); + syncmem_complex_h2d_op()(d_wfcr_2, wfcr_2.data(), wfcr_2.size()); + syncmem_complex_h2d_op()(d_wfcr_another_spin_2, wfcr_another_spin_2.data(), wfcr_another_spin_2.size()); double ** rho = new double* [4]; rho[0] = d_rho_data_2; rho[1] = d_rho_data_2 + this->nrxx; @@ -158,7 +158,7 @@ TEST_F(TestModuleElecstateMultiDevice, elecstate_pw_spin_op_gpu) d_wfcr_2, d_wfcr_another_spin_2); - syncmem_var_d2h_op()(cpu_ctx, gpu_ctx, rho_data_2.data(), d_rho_data_2, rho_data_2.size()); + syncmem_var_d2h_op()(rho_data_2.data(), d_rho_data_2, rho_data_2.size()); // check the result for (int ii = 0; ii < rho_data_2.size(); ii++) { EXPECT_LT(fabs(rho_data_2[ii] - expected_rho_2[ii]), 5e-4); diff --git a/source/module_elecstate/potentials/potential_new.cpp b/source/module_elecstate/potentials/potential_new.cpp index a4443c46d8..f3d68df05a 100644 --- a/source/module_elecstate/potentials/potential_new.cpp +++ b/source/module_elecstate/potentials/potential_new.cpp @@ -50,18 +50,18 @@ Potential::~Potential() } if (PARAM.inp.basis_type == "pw" && PARAM.inp.device == "gpu") { if (PARAM.inp.precision == "single") { - delmem_sd_op()(gpu_ctx, s_veff_smooth); - delmem_sd_op()(gpu_ctx, s_vofk_smooth); + delmem_sd_op()(s_veff_smooth); + delmem_sd_op()(s_vofk_smooth); } else { - delmem_dd_op()(gpu_ctx, d_veff_smooth); - delmem_dd_op()(gpu_ctx, d_vofk_smooth); + delmem_dd_op()(d_veff_smooth); + delmem_dd_op()(d_vofk_smooth); } } else { if (PARAM.inp.precision == "single") { - delmem_sh_op()(cpu_ctx, s_veff_smooth); - delmem_sh_op()(cpu_ctx, s_vofk_smooth); + delmem_sh_op()(s_veff_smooth); + delmem_sh_op()(s_vofk_smooth); } } } @@ -133,18 +133,18 @@ void Potential::allocate() } if (PARAM.inp.basis_type == "pw" && PARAM.inp.device == "gpu") { if (PARAM.inp.precision == "single") { - resmem_sd_op()(gpu_ctx, s_veff_smooth, PARAM.inp.nspin * nrxx_smooth); - resmem_sd_op()(gpu_ctx, s_vofk_smooth, PARAM.inp.nspin * nrxx_smooth); + resmem_sd_op()(s_veff_smooth, PARAM.inp.nspin * nrxx_smooth); + resmem_sd_op()(s_vofk_smooth, PARAM.inp.nspin * nrxx_smooth); } else { - resmem_dd_op()(gpu_ctx, d_veff_smooth, PARAM.inp.nspin * nrxx_smooth); - resmem_dd_op()(gpu_ctx, d_vofk_smooth, PARAM.inp.nspin * nrxx_smooth); + resmem_dd_op()(d_veff_smooth, PARAM.inp.nspin * nrxx_smooth); + resmem_dd_op()(d_vofk_smooth, PARAM.inp.nspin * nrxx_smooth); } } else { if (PARAM.inp.precision == "single") { - resmem_sh_op()(cpu_ctx, s_veff_smooth, PARAM.inp.nspin * nrxx_smooth, "POT::sveff_smooth"); - resmem_sh_op()(cpu_ctx, s_vofk_smooth, PARAM.inp.nspin * nrxx_smooth, "POT::svofk_smooth"); + resmem_sh_op()(s_veff_smooth, PARAM.inp.nspin * nrxx_smooth, "POT::sveff_smooth"); + resmem_sh_op()(s_vofk_smooth, PARAM.inp.nspin * nrxx_smooth, "POT::svofk_smooth"); } else { this->d_veff_smooth = this->veff_smooth.c; @@ -181,40 +181,28 @@ void Potential::update_from_charge(const Charge*const chg, const UnitCell*const if (PARAM.inp.basis_type == "pw" && PARAM.inp.device == "gpu") { if (PARAM.inp.precision == "single") { - castmem_d2s_h2d_op()(gpu_ctx, - cpu_ctx, - s_veff_smooth, + castmem_d2s_h2d_op()(s_veff_smooth, this->veff_smooth.c, this->veff_smooth.nr * this->veff_smooth.nc); - castmem_d2s_h2d_op()(gpu_ctx, - cpu_ctx, - s_vofk_smooth, + castmem_d2s_h2d_op()(s_vofk_smooth, this->vofk_smooth.c, this->vofk_smooth.nr * this->vofk_smooth.nc); } else { - syncmem_d2d_h2d_op()(gpu_ctx, - cpu_ctx, - d_veff_smooth, + syncmem_d2d_h2d_op()(d_veff_smooth, this->veff_smooth.c, this->veff_smooth.nr * this->veff_smooth.nc); - syncmem_d2d_h2d_op()(gpu_ctx, - cpu_ctx, - d_vofk_smooth, + syncmem_d2d_h2d_op()(d_vofk_smooth, this->vofk_smooth.c, this->vofk_smooth.nr * this->vofk_smooth.nc); } } else { if (PARAM.inp.precision == "single") { - castmem_d2s_h2h_op()(cpu_ctx, - cpu_ctx, - s_veff_smooth, + castmem_d2s_h2h_op()(s_veff_smooth, this->veff_smooth.c, this->veff_smooth.nr * this->veff_smooth.nc); - castmem_d2s_h2h_op()(cpu_ctx, - cpu_ctx, - s_vofk_smooth, + castmem_d2s_h2h_op()(s_vofk_smooth, this->vofk_smooth.c, this->vofk_smooth.nr * this->vofk_smooth.nc); } diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp index 84bf0fe8a4..a96d487a5c 100644 --- a/source/module_esolver/esolver_ks_pw.cpp +++ b/source/module_esolver/esolver_ks_pw.cpp @@ -646,9 +646,7 @@ void ESolver_KS_PW::after_scf(UnitCell& ucell, const int istep) // 4) Transfer data from GPU to CPU if (this->device == base_device::GpuDevice) { - castmem_2d_d2h_op()(this->psi[0].get_device(), - this->kspw_psi[0].get_device(), - this->psi[0].get_pointer() - this->psi[0].get_psi_bias(), + castmem_2d_d2h_op()(this->psi[0].get_pointer() - this->psi[0].get_psi_bias(), this->kspw_psi[0].get_pointer() - this->kspw_psi[0].get_psi_bias(), this->psi[0].size()); } diff --git a/source/module_hamilt_general/hamilt.h b/source/module_hamilt_general/hamilt.h index 70dcd1b20a..cb204cc298 100644 --- a/source/module_hamilt_general/hamilt.h +++ b/source/module_hamilt_general/hamilt.h @@ -39,7 +39,7 @@ class Hamilt const int nbands // number of bands ) const { - syncmem_op()(this->ctx, this->ctx, spsi, psi_in, static_cast(nbands * nrow)); + syncmem_op()(spsi, psi_in, static_cast(nbands * nrow)); } /// core function: return H(k) and S(k) matrixs for direct solving eigenvalues. diff --git a/source/module_hamilt_general/operator.cpp b/source/module_hamilt_general/operator.cpp index 008d5e30e3..e9020866e6 100644 --- a/source/module_hamilt_general/operator.cpp +++ b/source/module_hamilt_general/operator.cpp @@ -59,7 +59,7 @@ typename Operator::hpsi_info Operator::hPsi(hpsi_info& inp if (this->in_place) { // ModuleBase::GlobalFunc::COPYARRAY(this->hpsi->get_pointer(), hpsi_pointer, this->hpsi->size()); - syncmem_op()(this->ctx, this->ctx, hpsi_pointer, this->hpsi->get_pointer(), this->hpsi->size()); + syncmem_op()(hpsi_pointer, this->hpsi->get_pointer(), this->hpsi->size()); delete this->hpsi; this->hpsi = new psi::Psi(hpsi_pointer, 1, diff --git a/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp b/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp index 87a2fa41cc..36baed7bab 100644 --- a/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp +++ b/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp @@ -27,8 +27,8 @@ void spinconstrain::SpinConstrain>::calculate_delta_hcc(std #if ((defined __CUDA) || (defined __ROCM)) base_device::DEVICE_GPU* ctx = {}; base_device::DEVICE_CPU* cpu_ctx = {}; - base_device::memory::resize_memory_op, base_device::DEVICE_CPU>()(cpu_ctx, becp_cpu, size_ps); - base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(cpu_ctx, ctx, becp_cpu, becp_k, size_ps); + base_device::memory::resize_memory_op, base_device::DEVICE_CPU>()(becp_cpu, size_ps); + base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(becp_cpu, becp_k, size_ps); #endif } else if (PARAM.inp.device == "cpu") @@ -68,8 +68,8 @@ void spinconstrain::SpinConstrain>::calculate_delta_hcc(std #if ((defined __CUDA) || (defined __ROCM)) base_device::DEVICE_GPU* ctx = {}; base_device::DEVICE_CPU* cpu_ctx = {}; - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, ps_pointer, size_ps); - base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(ctx, cpu_ctx, ps_pointer, ps.data(), size_ps); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ps_pointer, size_ps); + base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(ps_pointer, ps.data(), size_ps); #endif } else if (PARAM.inp.device == "cpu") @@ -100,7 +100,7 @@ void spinconstrain::SpinConstrain>::calculate_delta_hcc(std h_tmp, nbands ); - base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(ctx, ps_pointer); + base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(ps_pointer); delete[] becp_cpu; #endif @@ -260,20 +260,20 @@ void spinconstrain::SpinConstrain>::cal_mw_from_lambda(int becp_tmp.resize(size_becp * nk); std::complex* h_tmp = nullptr; std::complex* s_tmp = nullptr; - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, h_tmp, nbands * nbands); - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, s_tmp, nbands * nbands); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(h_tmp, nbands * nbands); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(s_tmp, nbands * nbands); int initial_hs = 0; if(this->sub_h_save == nullptr) { initial_hs = 1; - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, this->sub_h_save, nbands * nbands * nk); - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, this->sub_s_save, nbands * nbands * nk); - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, this->becp_save, size_becp * nk); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(this->sub_h_save, nbands * nbands * nk); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(this->sub_s_save, nbands * nbands * nk); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(this->becp_save, size_becp * nk); } std::complex* becp_pointer = nullptr; // allocate memory for becp_pointer in GPU device - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, becp_pointer, size_becp); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(becp_pointer, size_becp); for (int ik = 0; ik < nk; ++ik) { psi_t->fix_k(ik); @@ -286,10 +286,10 @@ void spinconstrain::SpinConstrain>::cal_mw_from_lambda(int /// update H(k) for each k point hamilt_t->updateHk(ik); hsolver::DiagoIterAssist, base_device::DEVICE_GPU>::cal_hs_subspace(hamilt_t, psi_t[0], h_k, s_k); - base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, becp_k, onsite_p->get_becp(), size_becp); + base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(becp_k, onsite_p->get_becp(), size_becp); } - base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, h_tmp, h_k, nbands * nbands); - base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, s_tmp, s_k, nbands * nbands); + base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(h_tmp, h_k, nbands * nbands); + base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(s_tmp, s_k, nbands * nbands); // update h_tmp by delta_lambda if (i_step != -1) this->calculate_delta_hcc(h_tmp, becp_k, delta_lambda, nbands, nkb, nh_iat); @@ -301,11 +301,11 @@ void spinconstrain::SpinConstrain>::cal_mw_from_lambda(int nkb * npol, &this->pelec->ekb(ik, 0)); // copy becp_pointer from GPU to CPU - base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(cpu_ctx, ctx, &becp_tmp[ik * size_becp], becp_pointer, size_becp); + base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(&becp_tmp[ik * size_becp], becp_pointer, size_becp); } // free memory for becp_pointer in GPU device - base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(ctx, becp_pointer); + base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(becp_pointer); } #endif // calculate weights from ekb to update wg @@ -462,8 +462,8 @@ void spinconstrain::SpinConstrain>::update_psi_charge(const std::complex* h_tmp = nullptr; std::complex* s_tmp = nullptr; - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, h_tmp, nbands * nbands); - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, s_tmp, nbands * nbands); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(h_tmp, nbands * nbands); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(s_tmp, nbands * nbands); assert(this->sub_h_save != nullptr); assert(this->sub_s_save != nullptr); assert(this->becp_save != nullptr); @@ -474,8 +474,8 @@ void spinconstrain::SpinConstrain>::update_psi_charge(const std::complex* becp_k = this->becp_save + ik * size_becp; psi_t->fix_k(ik); - base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, h_tmp, h_k, nbands * nbands); - base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, s_tmp, s_k, nbands * nbands); + base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(h_tmp, h_k, nbands * nbands); + base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(s_tmp, s_k, nbands * nbands); this->calculate_delta_hcc(h_tmp, becp_k, delta_lambda, nbands, nkb, nh_iat); hsolver::DiagoIterAssist, base_device::DEVICE_GPU>::diag_subspace_psi(h_tmp, s_tmp, @@ -484,9 +484,9 @@ void spinconstrain::SpinConstrain>::update_psi_charge(const &this->pelec->ekb(ik, 0)); } - base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(ctx, sub_h_save); - base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(ctx, sub_s_save); - base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(ctx, becp_save); + base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(sub_h_save); + base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(sub_s_save); + base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(becp_save); this->sub_h_save = nullptr; this->sub_s_save = nullptr; this->becp_save = nullptr; diff --git a/source/module_hamilt_pw/hamilt_pwdft/VNL_in_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/VNL_in_pw.cpp index b41c8f476e..bcd0cba74d 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/VNL_in_pw.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/VNL_in_pw.cpp @@ -36,42 +36,42 @@ void pseudopot_cell_vnl::release_memory() { if (PARAM.inp.precision == "single") { - delmem_sd_op()(gpu_ctx, this->s_deeq); - delmem_sd_op()(gpu_ctx, this->s_nhtol); - delmem_sd_op()(gpu_ctx, this->s_nhtolm); - delmem_sd_op()(gpu_ctx, this->s_indv); - delmem_sd_op()(gpu_ctx, this->s_tab); - delmem_sd_op()(gpu_ctx, this->s_qq_nt); - delmem_cd_op()(gpu_ctx, this->c_deeq_nc); - delmem_cd_op()(gpu_ctx, this->c_vkb); - delmem_cd_op()(gpu_ctx, this->c_qq_so); + delmem_sd_op()(this->s_deeq); + delmem_sd_op()(this->s_nhtol); + delmem_sd_op()(this->s_nhtolm); + delmem_sd_op()(this->s_indv); + delmem_sd_op()(this->s_tab); + delmem_sd_op()(this->s_qq_nt); + delmem_cd_op()(this->c_deeq_nc); + delmem_cd_op()(this->c_vkb); + delmem_cd_op()(this->c_qq_so); } else { - delmem_zd_op()(gpu_ctx, this->z_deeq_nc); - delmem_zd_op()(gpu_ctx, this->z_qq_so); + delmem_zd_op()(this->z_deeq_nc); + delmem_zd_op()(this->z_qq_so); } - delmem_dd_op()(gpu_ctx, this->d_deeq); - delmem_zd_op()(gpu_ctx, this->z_vkb); - delmem_dd_op()(gpu_ctx, this->d_tab); - delmem_dd_op()(gpu_ctx, this->d_indv); - delmem_dd_op()(gpu_ctx, this->d_nhtol); - delmem_dd_op()(gpu_ctx, this->d_nhtolm); - delmem_dd_op()(gpu_ctx, this->d_qq_nt); + delmem_dd_op()(this->d_deeq); + delmem_zd_op()(this->z_vkb); + delmem_dd_op()(this->d_tab); + delmem_dd_op()(this->d_indv); + delmem_dd_op()(this->d_nhtol); + delmem_dd_op()(this->d_nhtolm); + delmem_dd_op()(this->d_qq_nt); } else { if (PARAM.inp.precision == "single") { - delmem_sh_op()(cpu_ctx, this->s_deeq); - delmem_sh_op()(cpu_ctx, this->s_nhtol); - delmem_sh_op()(cpu_ctx, this->s_nhtolm); - delmem_sh_op()(cpu_ctx, this->s_indv); - delmem_sh_op()(cpu_ctx, this->s_tab); - delmem_sh_op()(cpu_ctx, this->s_qq_nt); - delmem_ch_op()(cpu_ctx, this->c_deeq_nc); - delmem_ch_op()(cpu_ctx, this->c_vkb); - delmem_ch_op()(cpu_ctx, this->c_qq_so); + delmem_sh_op()(this->s_deeq); + delmem_sh_op()(this->s_nhtol); + delmem_sh_op()(this->s_nhtolm); + delmem_sh_op()(this->s_indv); + delmem_sh_op()(this->s_tab); + delmem_sh_op()(this->s_qq_nt); + delmem_ch_op()(this->c_deeq_nc); + delmem_ch_op()(this->c_vkb); + delmem_ch_op()(this->c_qq_so); } // There's no need to delete double precision pointers while in a CPU environment. } @@ -158,42 +158,40 @@ void pseudopot_cell_vnl::init(const UnitCell& ucell, { if (PARAM.inp.precision == "single") { - resmem_sd_op()(gpu_ctx, s_deeq, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm); - resmem_sd_op()(gpu_ctx, s_nhtol, ntype * this->nhm); - resmem_sd_op()(gpu_ctx, s_nhtolm, ntype * this->nhm); - resmem_sd_op()(gpu_ctx, s_indv, ntype * this->nhm); - resmem_sd_op()(gpu_ctx, s_qq_nt, ntype * this->nhm * this->nhm); - resmem_cd_op()(gpu_ctx, c_deeq_nc, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm); - resmem_cd_op()(gpu_ctx, c_qq_so, ntype * 4 * this->nhm * this->nhm); + resmem_sd_op()(s_deeq, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm); + resmem_sd_op()(s_nhtol, ntype * this->nhm); + resmem_sd_op()(s_nhtolm, ntype * this->nhm); + resmem_sd_op()(s_indv, ntype * this->nhm); + resmem_sd_op()(s_qq_nt, ntype * this->nhm * this->nhm); + resmem_cd_op()(c_deeq_nc, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm); + resmem_cd_op()(c_qq_so, ntype * 4 * this->nhm * this->nhm); } else { - resmem_zd_op()(gpu_ctx, z_deeq_nc, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm); - resmem_zd_op()(gpu_ctx, z_qq_so, ntype * 4 * this->nhm * this->nhm); + resmem_zd_op()(z_deeq_nc, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm); + resmem_zd_op()(z_qq_so, ntype * 4 * this->nhm * this->nhm); } - resmem_dd_op()(gpu_ctx, d_deeq, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm); - resmem_dd_op()(gpu_ctx, d_indv, ntype * this->nhm); - resmem_dd_op()(gpu_ctx, d_nhtol, ntype * this->nhm); - resmem_dd_op()(gpu_ctx, d_nhtolm, ntype * this->nhm); - resmem_dd_op()(gpu_ctx, d_qq_nt, ntype * this->nhm * this->nhm); + resmem_dd_op()(d_deeq, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm); + resmem_dd_op()(d_indv, ntype * this->nhm); + resmem_dd_op()(d_nhtol, ntype * this->nhm); + resmem_dd_op()(d_nhtolm, ntype * this->nhm); + resmem_dd_op()(d_qq_nt, ntype * this->nhm * this->nhm); } else { if (PARAM.inp.precision == "single") { - resmem_sh_op()(cpu_ctx, - s_deeq, + resmem_sh_op()(s_deeq, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm, "VNL::s_deeq"); - resmem_sh_op()(cpu_ctx, s_nhtol, ntype * this->nhm, "VNL::s_nhtol"); - resmem_sh_op()(cpu_ctx, s_nhtolm, ntype * this->nhm, "VNL::s_nhtolm"); - resmem_sh_op()(cpu_ctx, s_indv, ntype * this->nhm, "VNL::s_indv"); - resmem_sh_op()(cpu_ctx, s_qq_nt, ntype * this->nhm * this->nhm, "VNL::s_qq_nt"); - resmem_ch_op()(cpu_ctx, - c_deeq_nc, + resmem_sh_op()(s_nhtol, ntype * this->nhm, "VNL::s_nhtol"); + resmem_sh_op()(s_nhtolm, ntype * this->nhm, "VNL::s_nhtolm"); + resmem_sh_op()(s_indv, ntype * this->nhm, "VNL::s_indv"); + resmem_sh_op()(s_qq_nt, ntype * this->nhm * this->nhm, "VNL::s_qq_nt"); + resmem_ch_op()(c_deeq_nc, PARAM.inp.nspin * ucell.nat * this->nhm * this->nhm, "VNL::c_deeq_nc"); - resmem_ch_op()(cpu_ctx, c_qq_so, ntype * 4 * this->nhm * this->nhm, "VNL::c_qq_so"); + resmem_ch_op()(c_qq_so, ntype * 4 * this->nhm * this->nhm, "VNL::c_qq_so"); } else { @@ -275,18 +273,18 @@ void pseudopot_cell_vnl::init(const UnitCell& ucell, { if (PARAM.inp.precision == "single") { - resmem_sd_op()(gpu_ctx, s_tab, this->tab.getSize()); - resmem_cd_op()(gpu_ctx, c_vkb, nkb * npwx); + resmem_sd_op()(s_tab, this->tab.getSize()); + resmem_cd_op()(c_vkb, nkb * npwx); } - resmem_zd_op()(gpu_ctx, z_vkb, nkb * npwx); - resmem_dd_op()(gpu_ctx, d_tab, this->tab.getSize()); + resmem_zd_op()(z_vkb, nkb * npwx); + resmem_dd_op()(d_tab, this->tab.getSize()); } else { if (PARAM.inp.precision == "single") { - resmem_sh_op()(cpu_ctx, s_tab, this->tab.getSize()); - resmem_ch_op()(cpu_ctx, c_vkb, nkb * npwx); + resmem_sh_op()(s_tab, this->tab.getSize()); + resmem_ch_op()(c_vkb, nkb * npwx); } this->z_vkb = this->vkb.c; this->d_tab = this->tab.ptr; @@ -339,7 +337,7 @@ void pseudopot_cell_vnl::getvnl(const int& ik, const UnitCell& ucell, ModuleBase using resmem_complex_op = base_device::memory::resize_memory_op, Device>; using delmem_complex_op = base_device::memory::delete_memory_op, Device>; std::complex* sk = nullptr; - resmem_complex_op()(ctx, sk, ucell.nat * npw, "VNL::sk"); + resmem_complex_op()(sk, ucell.nat * npw, "VNL::sk"); this->psf->get_sk(ctx, ik, this->wfcpw, sk); int jkb = 0, iat = 0; @@ -404,7 +402,7 @@ void pseudopot_cell_vnl::getvnl(const int& ik, const UnitCell& ucell, ModuleBase delete[] gk; delete[] vq; - delmem_complex_op()(ctx, sk); + delmem_complex_op()(sk); ModuleBase::timer::tick("pp_cell_vnl", "getvnl"); return; @@ -457,8 +455,8 @@ void pseudopot_cell_vnl::getvnl(Device* ctx, FPTYPE *vkb1 = nullptr, *gk = nullptr, *ylm = nullptr, *_tab = this->get_tab_data(), *_indv = this->get_indv_data(), *_nhtol = this->get_nhtol_data(), *_nhtolm = this->get_nhtolm_data(); - resmem_var_op()(ctx, ylm, x1 * npw, "VNL::ylm"); - resmem_var_op()(ctx, vkb1, nhm * npw, "VNL::vkb1"); + resmem_var_op()(ylm, x1 * npw, "VNL::ylm"); + resmem_var_op()(vkb1, nhm * npw, "VNL::vkb1"); ModuleBase::Vector3* _gk = new ModuleBase::Vector3[npw]; #ifdef _OPENMP @@ -470,15 +468,15 @@ void pseudopot_cell_vnl::getvnl(Device* ctx, } if (PARAM.inp.device == "gpu") { - resmem_int_op()(ctx, atom_nh, ucell.ntype); - resmem_int_op()(ctx, atom_nb, ucell.ntype); - resmem_int_op()(ctx, atom_na, ucell.ntype); - syncmem_int_op()(ctx, cpu_ctx, atom_nh, h_atom_nh, ucell.ntype); - syncmem_int_op()(ctx, cpu_ctx, atom_nb, h_atom_nb, ucell.ntype); - syncmem_int_op()(ctx, cpu_ctx, atom_na, h_atom_na, ucell.ntype); - - resmem_var_op()(ctx, gk, npw * 3); - castmem_var_h2d_op()(ctx, cpu_ctx, gk, reinterpret_cast(_gk), npw * 3); + resmem_int_op()(atom_nh, ucell.ntype); + resmem_int_op()(atom_nb, ucell.ntype); + resmem_int_op()(atom_na, ucell.ntype); + syncmem_int_op()(atom_nh, h_atom_nh, ucell.ntype); + syncmem_int_op()(atom_nb, h_atom_nb, ucell.ntype); + syncmem_int_op()(atom_na, h_atom_na, ucell.ntype); + + resmem_var_op()(gk, npw * 3); + castmem_var_h2d_op()(gk, reinterpret_cast(_gk), npw * 3); } else { @@ -487,8 +485,8 @@ void pseudopot_cell_vnl::getvnl(Device* ctx, atom_na = h_atom_na; if (PARAM.inp.precision == "single") { - resmem_var_op()(ctx, gk, npw * 3); - castmem_var_h2h_op()(cpu_ctx, cpu_ctx, gk, reinterpret_cast(_gk), npw * 3); + resmem_var_op()(gk, npw * 3); + castmem_var_h2h_op()(gk, reinterpret_cast(_gk), npw * 3); } else { @@ -499,7 +497,7 @@ void pseudopot_cell_vnl::getvnl(Device* ctx, ModuleBase::YlmReal::Ylm_Real(ctx, x1, npw, gk, ylm); std::complex* sk = nullptr; - resmem_complex_op()(ctx, sk, ucell.nat * npw); + resmem_complex_op()(sk, ucell.nat * npw); this->psf->get_sk(ctx, ik, this->wfcpw, sk); cal_vnl_op()(ctx, @@ -529,18 +527,18 @@ void pseudopot_cell_vnl::getvnl(Device* ctx, delete[] h_atom_nh; delete[] h_atom_na; delete[] h_atom_nb; - delmem_var_op()(ctx, ylm); - delmem_var_op()(ctx, vkb1); - delmem_complex_op()(ctx, sk); + delmem_var_op()(ylm); + delmem_var_op()(vkb1); + delmem_complex_op()(sk); if (PARAM.inp.device == "gpu" || PARAM.inp.precision == "single") { - delmem_var_op()(ctx, gk); + delmem_var_op()(gk); } if (PARAM.inp.device == "gpu") { - delmem_int_op()(ctx, atom_nh); - delmem_int_op()(ctx, atom_nb); - delmem_int_op()(ctx, atom_na); + delmem_int_op()(atom_nh); + delmem_int_op()(atom_nb); + delmem_int_op()(atom_na); } ModuleBase::timer::tick("pp_cell_vnl", "getvnl"); } // end subroutine getvnl @@ -874,36 +872,36 @@ void pseudopot_cell_vnl::init_vnl(UnitCell& cell, const ModulePW::PW_Basis* rho_ { if (PARAM.inp.precision == "single") { - castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_indv, this->indv.c, this->indv.nr * this->indv.nc); - castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_nhtol, this->nhtol.c, this->nhtol.nr * this->nhtol.nc); - castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_nhtolm, this->nhtolm.c, this->nhtolm.nr * this->nhtolm.nc); - castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_tab, this->tab.ptr, this->tab.getSize()); - castmem_d2s_h2d_op()(gpu_ctx, cpu_ctx, this->s_qq_nt, this->qq_nt.ptr, this->qq_nt.getSize()); - castmem_z2c_h2d_op()(gpu_ctx, cpu_ctx, this->c_qq_so, this->qq_so.ptr, this->qq_so.getSize()); + castmem_d2s_h2d_op()(this->s_indv, this->indv.c, this->indv.nr * this->indv.nc); + castmem_d2s_h2d_op()(this->s_nhtol, this->nhtol.c, this->nhtol.nr * this->nhtol.nc); + castmem_d2s_h2d_op()(this->s_nhtolm, this->nhtolm.c, this->nhtolm.nr * this->nhtolm.nc); + castmem_d2s_h2d_op()(this->s_tab, this->tab.ptr, this->tab.getSize()); + castmem_d2s_h2d_op()(this->s_qq_nt, this->qq_nt.ptr, this->qq_nt.getSize()); + castmem_z2c_h2d_op()(this->c_qq_so, this->qq_so.ptr, this->qq_so.getSize()); } else { - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, this->z_qq_so, this->qq_so.ptr, this->qq_so.getSize()); + syncmem_z2z_h2d_op()(this->z_qq_so, this->qq_so.ptr, this->qq_so.getSize()); } // Even when the single precision flag is enabled, // these variables are utilized in the Force/Stress calculation as well. // modified by denghuilu at 2023-05-15 - syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_indv, this->indv.c, this->indv.nr * this->indv.nc); - syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_nhtol, this->nhtol.c, this->nhtol.nr * this->nhtol.nc); - syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_nhtolm, this->nhtolm.c, this->nhtolm.nr * this->nhtolm.nc); - syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_tab, this->tab.ptr, this->tab.getSize()); - syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, this->d_qq_nt, this->qq_nt.ptr, this->qq_nt.getSize()); + syncmem_d2d_h2d_op()(this->d_indv, this->indv.c, this->indv.nr * this->indv.nc); + syncmem_d2d_h2d_op()(this->d_nhtol, this->nhtol.c, this->nhtol.nr * this->nhtol.nc); + syncmem_d2d_h2d_op()(this->d_nhtolm, this->nhtolm.c, this->nhtolm.nr * this->nhtolm.nc); + syncmem_d2d_h2d_op()(this->d_tab, this->tab.ptr, this->tab.getSize()); + syncmem_d2d_h2d_op()(this->d_qq_nt, this->qq_nt.ptr, this->qq_nt.getSize()); } else { if (PARAM.inp.precision == "single") { - castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_indv, this->indv.c, this->indv.nr * this->indv.nc); - castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_nhtol, this->nhtol.c, this->nhtol.nr * this->nhtol.nc); - castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_nhtolm, this->nhtolm.c, this->nhtolm.nr * this->nhtolm.nc); - castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_tab, this->tab.ptr, this->tab.getSize()); - castmem_d2s_h2h_op()(cpu_ctx, cpu_ctx, this->s_qq_nt, this->qq_nt.ptr, this->qq_nt.getSize()); - castmem_z2c_h2h_op()(cpu_ctx, cpu_ctx, this->c_qq_so, this->qq_so.ptr, this->qq_so.getSize()); + castmem_d2s_h2h_op()(this->s_indv, this->indv.c, this->indv.nr * this->indv.nc); + castmem_d2s_h2h_op()(this->s_nhtol, this->nhtol.c, this->nhtol.nr * this->nhtol.nc); + castmem_d2s_h2h_op()(this->s_nhtolm, this->nhtolm.c, this->nhtolm.nr * this->nhtolm.nc); + castmem_d2s_h2h_op()(this->s_tab, this->tab.ptr, this->tab.getSize()); + castmem_d2s_h2h_op()(this->s_qq_nt, this->qq_nt.ptr, this->qq_nt.getSize()); + castmem_z2c_h2h_op()(this->c_qq_so, this->qq_so.ptr, this->qq_so.getSize()); } // There's no need to synchronize double precision pointers while in a CPU environment. } @@ -1082,7 +1080,7 @@ void pseudopot_cell_vnl::radial_fft_q(Device* ctx, const int ivl = nhtolm(itype, ih); const int jvl = nhtolm(itype, jh); - setmem_complex_op()(ctx, qg, 0, ng); + setmem_complex_op()(qg, 0, ng); const double* qnorm_double = reinterpret_cast(qnorm); @@ -1492,28 +1490,20 @@ void pseudopot_cell_vnl::cal_effective_D(const ModuleBase::matrix& veff, { if (PARAM.inp.precision == "single") { - castmem_d2s_h2d_op()(gpu_ctx, - cpu_ctx, - this->s_deeq, + castmem_d2s_h2d_op()(this->s_deeq, this->deeq.ptr, PARAM.inp.nspin * cell.nat * this->nhm * this->nhm); - castmem_z2c_h2d_op()(gpu_ctx, - cpu_ctx, - this->c_deeq_nc, + castmem_z2c_h2d_op()(this->c_deeq_nc, this->deeq_nc.ptr, PARAM.inp.nspin * cell.nat * this->nhm * this->nhm); } else { - syncmem_z2z_h2d_op()(gpu_ctx, - cpu_ctx, - this->z_deeq_nc, + syncmem_z2z_h2d_op()(this->z_deeq_nc, this->deeq_nc.ptr, PARAM.inp.nspin * cell.nat * this->nhm * this->nhm); } - syncmem_d2d_h2d_op()(gpu_ctx, - cpu_ctx, - this->d_deeq, + syncmem_d2d_h2d_op()(this->d_deeq, this->deeq.ptr, PARAM.inp.nspin * cell.nat * this->nhm * this->nhm); } @@ -1521,14 +1511,10 @@ void pseudopot_cell_vnl::cal_effective_D(const ModuleBase::matrix& veff, { if (PARAM.inp.precision == "single") { - castmem_d2s_h2h_op()(cpu_ctx, - cpu_ctx, - this->s_deeq, + castmem_d2s_h2h_op()(this->s_deeq, this->deeq.ptr, PARAM.inp.nspin * cell.nat * this->nhm * this->nhm); - castmem_z2c_h2h_op()(cpu_ctx, - cpu_ctx, - this->c_deeq_nc, + castmem_z2c_h2h_op()(this->c_deeq_nc, this->deeq_nc.ptr, PARAM.inp.nspin * cell.nat * this->nhm * this->nhm); } diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp index 3346724deb..41184b11d0 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_cc.cpp @@ -134,17 +134,17 @@ void Forces::cal_force_cc(ModuleBase::matrix& forcecc, } if(this->device == base_device::GpuDevice ) { - resmem_var_op()(this->ctx, gv_x_d, rho_basis->npw); - resmem_var_op()(this->ctx, gv_y_d, rho_basis->npw); - resmem_var_op()(this->ctx, gv_z_d, rho_basis->npw); - resmem_var_op()(this->ctx, rhocgigg_vec_d, rho_basis->npw); - resmem_complex_op()(this->ctx, psiv_d, rho_basis->nmaxgr); - resmem_var_op()(this->ctx, force_d, 3); - - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gv_x_d, gv_x.data(), rho_basis->npw); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gv_y_d, gv_y.data(), rho_basis->npw); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gv_z_d, gv_z.data(), rho_basis->npw); - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, psiv_d, psiv, rho_basis->nmaxgr); + resmem_var_op()(gv_x_d, rho_basis->npw); + resmem_var_op()(gv_y_d, rho_basis->npw); + resmem_var_op()(gv_z_d, rho_basis->npw); + resmem_var_op()(rhocgigg_vec_d, rho_basis->npw); + resmem_complex_op()(psiv_d, rho_basis->nmaxgr); + resmem_var_op()(force_d, 3); + + syncmem_var_h2d_op()(gv_x_d, gv_x.data(), rho_basis->npw); + syncmem_var_h2d_op()(gv_y_d, gv_y.data(), rho_basis->npw); + syncmem_var_h2d_op()(gv_z_d, gv_z.data(), rho_basis->npw); + syncmem_complex_h2d_op()(psiv_d, psiv, rho_basis->nmaxgr); } @@ -178,7 +178,7 @@ void Forces::cal_force_cc(ModuleBase::matrix& forcecc, } if(this->device == base_device::GpuDevice ) { - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rhocgigg_vec_d, rhocgigg_vec.data(), rho_basis->npw); + syncmem_var_h2d_op()(rhocgigg_vec_d, rhocgigg_vec.data(), rho_basis->npw); } for (int ia = 0; ia < ucell_in.atoms[it].na; ++ia) { @@ -188,12 +188,12 @@ void Forces::cal_force_cc(ModuleBase::matrix& forcecc, double force[3] = {0, 0, 0}; if(this->device == base_device::GpuDevice ) { - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, force_d, force, 3); + syncmem_var_h2d_op()(force_d, force, 3); hamilt::cal_force_npw_op()( psiv_d, gv_x_d, gv_y_d, gv_z_d, rhocgigg_vec_d, force_d, pos.x, pos.y, pos.z, rho_basis->npw, ucell_in.omega, ucell_in.tpiba ); - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force, force_d, 3); + syncmem_var_d2h_op()(force, force_d, 3); } else { hamilt::cal_force_npw_op()( @@ -213,12 +213,12 @@ void Forces::cal_force_cc(ModuleBase::matrix& forcecc, } if (this->device == base_device::GpuDevice) { - delmem_var_op()(this->ctx, gv_x_d); - delmem_var_op()(this->ctx, gv_y_d); - delmem_var_op()(this->ctx, gv_z_d); - delmem_var_op()(this->ctx, force_d); - delmem_var_op()(this->ctx, rhocgigg_vec_d); - delmem_complex_op()(this->ctx, psiv_d); + delmem_var_op()(gv_x_d); + delmem_var_op()(gv_y_d); + delmem_var_op()(gv_z_d); + delmem_var_op()(force_d); + delmem_var_op()(rhocgigg_vec_d); + delmem_complex_op()(psiv_d); } delete[] rhocg; @@ -308,24 +308,24 @@ void Forces::deriv_drhoc double *aux_d = nullptr; double *drhocg_d = nullptr; if(this->device == base_device::GpuDevice ) { - resmem_var_op()(this->ctx, r_d, mesh); - resmem_var_op()(this->ctx, rhoc_d, mesh); - resmem_var_op()(this->ctx, rab_d, mesh); - - resmem_var_op()(this->ctx, aux_d, mesh); - resmem_var_op()(this->ctx, gx_arr_d, rho_basis->ngg); - resmem_var_op()(this->ctx, drhocg_d, rho_basis->ngg); - - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gx_arr_d, gx_arr.data(), rho_basis->ngg); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, r_d, r, mesh); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rab_d, rab, mesh); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rhoc_d, rhoc, mesh); + resmem_var_op()(r_d, mesh); + resmem_var_op()(rhoc_d, mesh); + resmem_var_op()(rab_d, mesh); + + resmem_var_op()(aux_d, mesh); + resmem_var_op()(gx_arr_d, rho_basis->ngg); + resmem_var_op()(drhocg_d, rho_basis->ngg); + + syncmem_var_h2d_op()(gx_arr_d, gx_arr.data(), rho_basis->ngg); + syncmem_var_h2d_op()(r_d, r, mesh); + syncmem_var_h2d_op()(rab_d, rab, mesh); + syncmem_var_h2d_op()(rhoc_d, rhoc, mesh); } if(this->device == base_device::GpuDevice) { hamilt::cal_stress_drhoc_aux_op()( r_d,rhoc_d,gx_arr_d+igl0,rab_d,drhocg_d+igl0,mesh,igl0,rho_basis->ngg-igl0,ucell_in.omega,type); - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, drhocg+igl0, drhocg_d+igl0, rho_basis->ngg-igl0); + syncmem_var_d2h_op()(drhocg+igl0, drhocg_d+igl0, rho_basis->ngg-igl0); @@ -334,11 +334,11 @@ void Forces::deriv_drhoc r,rhoc,gx_arr.data()+igl0,rab,drhocg+igl0,mesh,igl0,rho_basis->ngg-igl0,ucell_in.omega,type); } - delmem_var_op()(this->ctx, r_d); - delmem_var_op()(this->ctx, rhoc_d); - delmem_var_op()(this->ctx, rab_d); - delmem_var_op()(this->ctx, gx_arr_d); - delmem_var_op()(this->ctx, drhocg_d); + delmem_var_op()(r_d); + delmem_var_op()(rhoc_d); + delmem_var_op()(rab_d); + delmem_var_op()(gx_arr_d); + delmem_var_op()(drhocg_d); return; } diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp index 8ecba030f3..bd615f0eef 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_nl.cpp @@ -27,8 +27,8 @@ void Forces::cal_force_nl(ModuleBase::matrix& forcenl, // allocate memory for the force FPTYPE* force = nullptr; - resmem_var_op()(this->ctx, force, ucell_in.nat * 3); - base_device::memory::set_memory_op()(this->ctx, force, 0.0, ucell_in.nat * 3); + resmem_var_op()(force, ucell_in.nat * 3); + base_device::memory::set_memory_op()(force, 0.0, ucell_in.nat * 3); hamilt::FS_Nonlocal_tools nl_tools(&nlpp, &ucell_in, p_kv, wfc_basis, p_sf, wg, &ekb); @@ -62,8 +62,8 @@ void Forces::cal_force_nl(ModuleBase::matrix& forcenl, nl_tools.cal_force(ik, max_nbands, npm, true, force); } // end ik - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forcenl.c, force, forcenl.nr * forcenl.nc); - delmem_var_op()(this->ctx, force); + syncmem_var_d2h_op()(forcenl.c, force, forcenl.nr * forcenl.nc); + delmem_var_op()(force); // sum up forcenl from all processors Parallel_Reduce::reduce_all(forcenl.c, forcenl.nr * forcenl.nc); diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp index 240187b3ba..36f90f0001 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp @@ -23,8 +23,8 @@ void Forces::cal_force_onsite(ModuleBase::matrix& force_onsite, // allocate memory for the force FPTYPE* force = nullptr; - resmem_var_op()(this->ctx, force, ucell_in.nat * 3); - base_device::memory::set_memory_op()(this->ctx, force, 0.0, ucell_in.nat * 3); + resmem_var_op()(force, ucell_in.nat * 3); + base_device::memory::set_memory_op()(force, 0.0, ucell_in.nat * 3); auto* onsite_p = projectors::OnsiteProjector::get_instance(); @@ -65,8 +65,8 @@ void Forces::cal_force_onsite(ModuleBase::matrix& force_onsite, } // end ik - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_onsite.c, force, force_onsite.nr * force_onsite.nc); - delmem_var_op()(this->ctx, force); + syncmem_var_d2h_op()(force_onsite.c, force, force_onsite.nr * force_onsite.nc); + delmem_var_op()(force); // sum up force_onsite from all processors Parallel_Reduce::reduce_all(force_onsite.c, force_onsite.nr * force_onsite.nc); diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp index f670ad9b27..ab63f43aff 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_scc.cpp @@ -190,28 +190,26 @@ void Forces::deriv_drhoc_scc(const bool& numeric, double *aux_d = nullptr; double *drhocg_d = nullptr; if (this->device == base_device::GpuDevice) { - resmem_var_op()(this->ctx, r_d, mesh); - resmem_var_op()(this->ctx, rhoc_d, mesh); - resmem_var_op()(this->ctx, rab_d, mesh); + resmem_var_op()(r_d, mesh); + resmem_var_op()(rhoc_d, mesh); + resmem_var_op()(rab_d, mesh); - resmem_var_op()(this->ctx, aux_d, mesh); - resmem_var_op()(this->ctx, gx_arr_d, rho_basis->ngg); - resmem_var_op()(this->ctx, drhocg_d, rho_basis->ngg); + resmem_var_op()(aux_d, mesh); + resmem_var_op()(gx_arr_d, rho_basis->ngg); + resmem_var_op()(drhocg_d, rho_basis->ngg); - syncmem_var_h2d_op()(this->ctx, - this->cpu_ctx, - gx_arr_d, + syncmem_var_h2d_op()(gx_arr_d, gx_arr.data(), rho_basis->ngg); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, r_d, r, mesh); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rab_d, rab, mesh); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rhoc_d, rhoc, mesh); + syncmem_var_h2d_op()(r_d, r, mesh); + syncmem_var_h2d_op()(rab_d, rab, mesh); + syncmem_var_h2d_op()(rhoc_d, rhoc, mesh); } if(this->device == base_device::GpuDevice) { hamilt::cal_stress_drhoc_aux_op()( r_d,rhoc_d,gx_arr_d+igl0,rab_d,drhocg_d+igl0,mesh,igl0,rho_basis->ngg-igl0,ucell_in.omega,2); - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, drhocg+igl0, drhocg_d+igl0, rho_basis->ngg-igl0); + syncmem_var_d2h_op()(drhocg+igl0, drhocg_d+igl0, rho_basis->ngg-igl0); } else { hamilt::cal_stress_drhoc_aux_op()( @@ -219,11 +217,11 @@ void Forces::deriv_drhoc_scc(const bool& numeric, } - delmem_var_op()(this->ctx, r_d); - delmem_var_op()(this->ctx, rhoc_d); - delmem_var_op()(this->ctx, rab_d); - delmem_var_op()(this->ctx, gx_arr_d); - delmem_var_op()(this->ctx, drhocg_d); + delmem_var_op()(r_d); + delmem_var_op()(rhoc_d); + delmem_var_op()(rab_d); + delmem_var_op()(gx_arr_d); + delmem_var_op()(drhocg_d); return; } diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp index 89efb3f879..00049866f9 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_kin_tools.cpp @@ -27,8 +27,8 @@ FS_Kin_tools::FS_Kin_tools(const UnitCell& ucell_in, if (this->device == base_device::GpuDevice) { - resmem_var_op()(this->ctx, d_gk, 3 * npwk_max); - resmem_var_op()(this->ctx, d_kfac, npwk_max); + resmem_var_op()(d_gk, 3 * npwk_max); + resmem_var_op()(d_kfac, npwk_max); } else { @@ -42,8 +42,8 @@ FS_Kin_tools::~FS_Kin_tools() { if (this->device == base_device::GpuDevice) { - delmem_var_op()(this->ctx, d_gk); - delmem_var_op()(this->ctx, d_kfac); + delmem_var_op()(d_gk); + delmem_var_op()(d_kfac); } } @@ -72,8 +72,8 @@ void FS_Kin_tools::cal_gk(const int& ik) } if (this->device == base_device::GpuDevice) { - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_gk, gk[0], 3 * npwk_max); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_kfac, kfac.data(), npwk_max); + syncmem_var_h2d_op()(d_gk, gk[0], 3 * npwk_max); + syncmem_var_h2d_op()(d_kfac, kfac.data(), npwk_max); } } diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp index 810b313292..523cb2b504 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp @@ -73,36 +73,36 @@ void FS_Nonlocal_tools::allocate_memory(const ModuleBase::matrix // allocate the memory for vkb and vkb_deri. if (this->device == base_device::GpuDevice) { - resmem_int_op()(this->ctx, this->d_dvkb_indexes, max_nh * 4); + resmem_int_op()(this->d_dvkb_indexes, max_nh * 4); } - resmem_var_op()(this->ctx, this->hd_vq, max_nbeta * max_npw); - resmem_var_op()(this->ctx, this->hd_vq_deri, max_nbeta * max_npw); + resmem_var_op()(this->hd_vq, max_nbeta * max_npw); + resmem_var_op()(this->hd_vq_deri, max_nbeta * max_npw); const int _lmax = this->nlpp_->lmaxkb; - resmem_var_op()(this->ctx, this->hd_ylm, (_lmax + 1) * (_lmax + 1) * max_npw); - resmem_var_op()(this->ctx, this->hd_ylm_deri, 3 * (_lmax + 1) * (_lmax + 1) * max_npw); + resmem_var_op()(this->hd_ylm, (_lmax + 1) * (_lmax + 1) * max_npw); + resmem_var_op()(this->hd_ylm_deri, 3 * (_lmax + 1) * (_lmax + 1) * max_npw); const int nks = this->kv_->get_nks(); - resmem_var_op()(this->ctx, d_wk, nks); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wk, this->kv_->wk.data(), nks); + resmem_var_op()(d_wk, nks); + syncmem_var_h2d_op()(d_wk, this->kv_->wk.data(), nks); if (this->device == base_device::GpuDevice) { - resmem_var_op()(this->ctx, d_wg, wg.nr * wg.nc); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, wg.c, wg.nr * wg.nc); + resmem_var_op()(d_wg, wg.nr * wg.nc); + syncmem_var_h2d_op()(d_wg, wg.c, wg.nr * wg.nc); if (p_ekb != nullptr) { - resmem_var_op()(this->ctx, d_ekb, p_ekb->nr * p_ekb->nc); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_ekb, p_ekb->c, p_ekb->nr * p_ekb->nc); + resmem_var_op()(d_ekb, p_ekb->nr * p_ekb->nc); + syncmem_var_h2d_op()(d_ekb, p_ekb->c, p_ekb->nr * p_ekb->nc); } - resmem_int_op()(this->ctx, atom_nh, this->ntype); - resmem_int_op()(this->ctx, atom_na, this->ntype); - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, atom_nh, h_atom_nh.data(), this->ntype); - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, atom_na, h_atom_na.data(), this->ntype); + resmem_int_op()(atom_nh, this->ntype); + resmem_int_op()(atom_na, this->ntype); + syncmem_int_h2d_op()(atom_nh, h_atom_nh.data(), this->ntype); + syncmem_int_h2d_op()(atom_na, h_atom_na.data(), this->ntype); - resmem_var_op()(this->ctx, d_g_plus_k, max_npw * 5); - resmem_var_op()(this->ctx, d_pref, max_nh); - resmem_var_op()(this->ctx, d_vq_tab, this->nlpp_->tab.getSize()); - resmem_complex_op()(this->ctx, d_pref_in, max_nh); + resmem_var_op()(d_g_plus_k, max_npw * 5); + resmem_var_op()(d_pref, max_nh); + resmem_var_op()(d_vq_tab, this->nlpp_->tab.getSize()); + resmem_complex_op()(d_pref_in, max_nh); this->ppcell_vkb = this->nlpp_->template get_vkb_data(); } @@ -124,40 +124,40 @@ void FS_Nonlocal_tools::delete_memory() { // delete memory - delmem_var_op()(this->ctx, hd_vq); - delmem_var_op()(this->ctx, hd_vq_deri); - delmem_var_op()(this->ctx, hd_ylm); - delmem_var_op()(this->ctx, hd_ylm_deri); - delmem_var_op()(this->ctx, d_wk); + delmem_var_op()(hd_vq); + delmem_var_op()(hd_vq_deri); + delmem_var_op()(hd_ylm); + delmem_var_op()(hd_ylm_deri); + delmem_var_op()(d_wk); // delete memory on GPU if (this->device == base_device::GpuDevice) { - delmem_var_op()(this->ctx, d_wg); - delmem_var_op()(this->ctx, d_ekb); - delmem_int_op()(this->ctx, atom_nh); - delmem_int_op()(this->ctx, atom_na); - delmem_var_op()(this->ctx, d_g_plus_k); - delmem_var_op()(this->ctx, d_pref); - delmem_var_op()(this->ctx, d_vq_tab); - delmem_complex_op()(this->ctx, this->d_pref_in); - delmem_int_op()(this->ctx, d_dvkb_indexes); + delmem_var_op()(d_wg); + delmem_var_op()(d_ekb); + delmem_int_op()(atom_nh); + delmem_int_op()(atom_na); + delmem_var_op()(d_g_plus_k); + delmem_var_op()(d_pref); + delmem_var_op()(d_vq_tab); + delmem_complex_op()(this->d_pref_in); + delmem_int_op()(d_dvkb_indexes); } if (becp != nullptr) { - delmem_complex_op()(this->ctx, becp); - delmem_complex_op()(this->ctx, hd_sk); + delmem_complex_op()(becp); + delmem_complex_op()(hd_sk); } if (dbecp != nullptr) { - delmem_complex_op()(this->ctx, dbecp); + delmem_complex_op()(dbecp); } if (this->pre_ik_f != -1) { - delmem_int_op()(this->ctx, gcar_zero_indexes); - delmem_complex_op()(this->ctx, vkb_save); - delmem_var_op()(this->ctx, gcar); + delmem_int_op()(gcar_zero_indexes); + delmem_complex_op()(vkb_save); + delmem_var_op()(gcar); } } @@ -170,7 +170,7 @@ void FS_Nonlocal_tools::cal_vkb(const int& ik, const int& nbdall const int size_becp = nbdall * npol * this->nkb; if (this->becp == nullptr) { - resmem_complex_op()(this->ctx, becp, size_becp); + resmem_complex_op()(becp, size_becp); } // prepare math tools @@ -183,7 +183,7 @@ void FS_Nonlocal_tools::cal_vkb(const int& ik, const int& nbdall this->g_plus_k = maths.cal_gk(ik, this->wfc_basis_); FPTYPE *gk = g_plus_k.data(), *vq_tb = this->nlpp_->tab.ptr; // calculate sk - resmem_complex_op()(ctx, hd_sk, this->ucell_->nat * npw); + resmem_complex_op()(hd_sk, this->ucell_->nat * npw); this->sf_->get_sk(ctx, ik, this->wfc_basis_, hd_sk); std::complex* d_sk = this->hd_sk; // prepare ylm,size: (lmax+1)^2 * this->max_npw @@ -191,8 +191,8 @@ void FS_Nonlocal_tools::cal_vkb(const int& ik, const int& nbdall maths.cal_ylm(lmax_, npw, g_plus_k.data(), hd_ylm); if (this->device == base_device::GpuDevice) { - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_g_plus_k, g_plus_k.data(), g_plus_k.size()); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vq_tab, this->nlpp_->tab.ptr, this->nlpp_->tab.getSize()); + syncmem_var_h2d_op()(d_g_plus_k, g_plus_k.data(), g_plus_k.size()); + syncmem_var_h2d_op()(d_vq_tab, this->nlpp_->tab.ptr, this->nlpp_->tab.getSize()); gk = d_g_plus_k; vq_tb = d_vq_tab; } @@ -229,8 +229,8 @@ void FS_Nonlocal_tools::cal_vkb(const int& ik, const int& nbdall this->dvkb_indexes.data()); if (this->device == base_device::GpuDevice) { - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_dvkb_indexes, dvkb_indexes.data(), nh * 4); - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_pref_in, pref.data(), nh); + syncmem_int_h2d_op()(d_dvkb_indexes, dvkb_indexes.data(), nh * 4); + syncmem_complex_h2d_op()(d_pref_in, pref.data(), nh); } for (int ia = 0; ia < h_atom_na[it]; ia++) @@ -312,7 +312,7 @@ void FS_Nonlocal_tools::cal_vkb_deri_s(const int& ik, const int size_becp = nbdall * npol * this->nkb; if (this->dbecp == nullptr) { - resmem_complex_op()(this->ctx, dbecp, size_becp); + resmem_complex_op()(dbecp, size_becp); } // prepare math tools @@ -383,8 +383,8 @@ void FS_Nonlocal_tools::cal_vkb_deri_s(const int& ik, this->dvkb_indexes.data()); if (this->device == base_device::GpuDevice) { - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_dvkb_indexes, dvkb_indexes.data(), nh * 4); - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_pref_in, pref.data(), nh); + syncmem_int_h2d_op()(d_dvkb_indexes, dvkb_indexes.data(), nh * 4); + syncmem_complex_h2d_op()(d_pref_in, pref.data(), nh); } for (int ia = 0; ia < h_atom_na[it]; ia++) { @@ -538,7 +538,7 @@ void FS_Nonlocal_tools::cal_vkb_deri_f(const int& ik, const int& const int size_becp = nbdall * npol * this->nkb; if (this->dbecp == nullptr) { - resmem_complex_op()(this->ctx, dbecp, 3 * size_becp); + resmem_complex_op()(dbecp, 3 * size_becp); } const std::complex* vkb_ptr = this->ppcell_vkb; @@ -547,8 +547,8 @@ void FS_Nonlocal_tools::cal_vkb_deri_f(const int& ik, const int& const int npw = this->wfc_basis_->npwk[ik]; if (this->pre_ik_f == -1) { - resmem_var_op()(this->ctx, gcar, 3 * this->wfc_basis_->npwk_max); - resmem_int_op()(this->ctx, gcar_zero_indexes, 3 * this->wfc_basis_->npwk_max); + resmem_var_op()(gcar, 3 * this->wfc_basis_->npwk_max); + resmem_int_op()(gcar_zero_indexes, 3 * this->wfc_basis_->npwk_max); } if (this->pre_ik_f != ik) @@ -730,10 +730,10 @@ void FS_Nonlocal_tools::transfer_gcar(const int& npw, const int& } // prepare the memory for vkb_save const int max_count = std::max(gcar_zero_counts[0], std::max(gcar_zero_counts[1], gcar_zero_counts[2])); - resmem_complex_op()(this->ctx, this->vkb_save, this->nkb * max_count); + resmem_complex_op()(this->vkb_save, this->nkb * max_count); // transfer the gcar and gcar_zero_indexes to the device - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gcar, gcar_tmp.data(), 3 * npw_max); - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, gcar_zero_indexes, gcar_zero_indexes_tmp.data(), 3 * npw_max); + syncmem_var_h2d_op()(gcar, gcar_tmp.data(), 3 * npw_max); + syncmem_int_h2d_op()(gcar_zero_indexes, gcar_zero_indexes_tmp.data(), 3 * npw_max); } // cal_force diff --git a/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp index 38ccd9632c..f877eb1985 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp @@ -246,7 +246,7 @@ void HamiltPW::sPsi(const T* psi_in, // psi return; } - syncmem_op()(this->ctx, this->ctx, spsi, psi_in, static_cast(nbands * nrow)); + syncmem_op()(spsi, psi_in, static_cast(nbands * nrow)); if (PARAM.globalv.use_uspp) { T* becp = nullptr; @@ -254,7 +254,7 @@ void HamiltPW::sPsi(const T* psi_in, // psi // psi updated, thus update if (this->ppcell->nkb > 0) { - resmem_complex_op()(this->ctx, becp, nbands * this->ppcell->nkb, "Hamilt::becp"); + resmem_complex_op()(becp, nbands * this->ppcell->nkb, "Hamilt::becp"); char transa = 'C'; char transb = 'N'; if (nbands == 1) @@ -294,8 +294,8 @@ void HamiltPW::sPsi(const T* psi_in, // psi Parallel_Reduce::reduce_pool(becp, this->ppcell->nkb * nbands); } - resmem_complex_op()(this->ctx, ps, this->ppcell->nkb * nbands, "Hamilt::ps"); - setmem_complex_op()(this->ctx, ps, 0, this->ppcell->nkb * nbands); + resmem_complex_op()(ps, this->ppcell->nkb * nbands, "Hamilt::ps"); + setmem_complex_op()(ps, 0, this->ppcell->nkb * nbands); // spsi = psi + sum qq |beta> if (PARAM.inp.noncolin) @@ -316,7 +316,7 @@ void HamiltPW::sPsi(const T* psi_in, // psi { const int nh = atoms->ncpp.nh; T* qqc = nullptr; - resmem_complex_op()(this->ctx, qqc, nh * nh, "Hamilt::qqc"); + resmem_complex_op()(qqc, nh * nh, "Hamilt::qqc"); Real* qq_now = &qq_nt[it * this->ppcell->nhm * this->ppcell->nhm]; for (int i = 0; i < nh; i++) { @@ -344,7 +344,7 @@ void HamiltPW::sPsi(const T* psi_in, // psi &ps[this->ppcell->indv_ijkb0[iat]], this->ppcell->nkb); } - delmem_complex_op()(ctx, qqc); + delmem_complex_op()(qqc); } } @@ -382,8 +382,8 @@ void HamiltPW::sPsi(const T* psi_in, // psi nrow); } } - delmem_complex_op()(this->ctx, ps); - delmem_complex_op()(this->ctx, becp); + delmem_complex_op()(ps); + delmem_complex_op()(becp); } } diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/ekinetic_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/ekinetic_op_test.cpp index 324d4fb752..7c06dfc154 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/ekinetic_op_test.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/ekinetic_op_test.cpp @@ -81,22 +81,22 @@ TEST_F(TestModuleHamiltEkinetic, ekinetic_pw_op_gpu) { double* gk2_dev = NULL; std::complex* hpsi_dev = NULL, * psi_dev = NULL; - resize_memory_double_op()(gpu_ctx, gk2_dev, gk2.size()); - resize_memory_complex_double_op()(gpu_ctx, psi_dev, psi.size()); + resize_memory_double_op()(gk2_dev, gk2.size()); + resize_memory_complex_double_op()(psi_dev, psi.size()); std::vector > hpsi(expected_hpsi.size(), std::complex(0.0, 0.0)); - resize_memory_complex_double_op()(gpu_ctx, hpsi_dev, hpsi.size()); - syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, hpsi_dev, hpsi.data(), hpsi.size()); - syncmem_d_h2d_op()(gpu_ctx, cpu_ctx, gk2_dev, gk2.data(), gk2.size()); - syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, psi_dev, psi.data(), psi.size()); + resize_memory_complex_double_op()(hpsi_dev, hpsi.size()); + syncmem_cd_h2d_op()(hpsi_dev, hpsi.data(), hpsi.size()); + syncmem_d_h2d_op()(gk2_dev, gk2.data(), gk2.size()); + syncmem_cd_h2d_op()(psi_dev, psi.data(), psi.size()); // ekinetic_cpu_op()(cpu_ctx, band, dim, dim, tpiba2, gk2.data(), hpsi.data(), psi.data()); ekinetic_gpu_op()(gpu_ctx, band, dim, dim, false, tpiba2, gk2_dev, hpsi_dev, psi_dev); - syncmem_cd_d2h_op()(cpu_ctx, gpu_ctx, hpsi.data(), hpsi_dev, hpsi.size()); + syncmem_cd_d2h_op()(hpsi.data(), hpsi_dev, hpsi.size()); for (int ii = 0; ii < hpsi.size(); ii++) { EXPECT_LT(fabs(hpsi[ii] - expected_hpsi[ii]), 1e-6); } - delete_memory_double_op()(gpu_ctx, gk2_dev); - delete_memory_complex_double_op()(gpu_ctx, psi_dev); - delete_memory_complex_double_op()(gpu_ctx, hpsi_dev); + delete_memory_double_op()(gk2_dev); + delete_memory_complex_double_op()(psi_dev); + delete_memory_complex_double_op()(hpsi_dev); } #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp index 0507ff3358..be237b64ba 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/force_op_test.cpp @@ -2919,12 +2919,12 @@ TEST_F(TestSrcPWForceMultiDevice, cal_vkb1_nl_op_gpu) std::vector> res = vkb1; std::complex*d_res = nullptr, *d_vkb = nullptr; double* d_gcar = nullptr; - resmem_complex_op()(gpu_ctx, d_res, res.size()); - resmem_complex_op()(gpu_ctx, d_vkb, vkb.size()); - resmem_var_op()(gpu_ctx, d_gcar, gcar.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_vkb, vkb.data(), vkb.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_gcar, gcar.data(), gcar.size()); + resmem_complex_op()(d_res, res.size()); + resmem_complex_op()(d_vkb, vkb.size()); + resmem_var_op()(d_gcar, gcar.size()); + syncmem_complex_h2d_op()(d_res, res.data(), res.size()); + syncmem_complex_h2d_op()(d_vkb, vkb.data(), vkb.size()); + syncmem_var_h2d_op()(d_gcar, gcar.data(), gcar.size()); hamilt::cal_vkb1_nl_op()(gpu_ctx, nkb, @@ -2936,16 +2936,16 @@ TEST_F(TestSrcPWForceMultiDevice, cal_vkb1_nl_op_gpu) d_vkb, d_gcar, d_res); - syncmem_complex_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size()); + syncmem_complex_d2h_op()(res.data(), d_res, res.size()); for (int ii = 0; ii < res.size(); ii++) { EXPECT_LT(fabs(res[ii] - expected_vkb1[ii]), 6e-5); } - delmem_complex_op()(gpu_ctx, d_res); - delmem_complex_op()(gpu_ctx, d_vkb); - delmem_var_op()(gpu_ctx, d_gcar); + delmem_complex_op()(d_res); + delmem_complex_op()(d_vkb); + delmem_var_op()(d_gcar); } TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu) @@ -2953,28 +2953,28 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu) std::vector res(expected_force.size(), 0); double *d_res = nullptr, *d_wg = nullptr, *d_deeq = nullptr; double *d_ekb = nullptr, *d_qq_nt = nullptr; - resmem_var_op()(gpu_ctx, d_wg, wg.size()); - resmem_var_op()(gpu_ctx, d_res, res.size()); - resmem_var_op()(gpu_ctx, d_deeq, deeq.size()); - resmem_var_op()(gpu_ctx, d_ekb, ekb.size()); - resmem_var_op()(gpu_ctx, d_qq_nt, qq_nt.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_wg, wg.data(), wg.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_deeq, deeq.data(), deeq.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_ekb, ekb.data(), ekb.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_qq_nt, qq_nt.data(), qq_nt.size()); + resmem_var_op()(d_wg, wg.size()); + resmem_var_op()(d_res, res.size()); + resmem_var_op()(d_deeq, deeq.size()); + resmem_var_op()(d_ekb, ekb.size()); + resmem_var_op()(d_qq_nt, qq_nt.size()); + syncmem_var_h2d_op()(d_wg, wg.data(), wg.size()); + syncmem_var_h2d_op()(d_res, res.data(), res.size()); + syncmem_var_h2d_op()(d_deeq, deeq.data(), deeq.size()); + syncmem_var_h2d_op()(d_ekb, ekb.data(), ekb.size()); + syncmem_var_h2d_op()(d_qq_nt, qq_nt.data(), qq_nt.size()); int *d_atom_nh = nullptr, *d_atom_na = nullptr; - resmem_int_op()(gpu_ctx, d_atom_nh, atom_nh.size()); - resmem_int_op()(gpu_ctx, d_atom_na, atom_na.size()); - syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_nh, atom_nh.data(), atom_nh.size()); - syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_na, atom_na.data(), atom_na.size()); + resmem_int_op()(d_atom_nh, atom_nh.size()); + resmem_int_op()(d_atom_na, atom_na.size()); + syncmem_int_h2d_op()(d_atom_nh, atom_nh.data(), atom_nh.size()); + syncmem_int_h2d_op()(d_atom_na, atom_na.data(), atom_na.size()); std::complex*d_becp = nullptr, *d_dbecp = nullptr; - resmem_complex_op()(gpu_ctx, d_becp, becp.size()); - resmem_complex_op()(gpu_ctx, d_dbecp, dbecp.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_becp, becp.data(), becp.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_dbecp, dbecp.data(), dbecp.size()); + resmem_complex_op()(d_becp, becp.size()); + resmem_complex_op()(d_dbecp, dbecp.size()); + syncmem_complex_h2d_op()(d_becp, becp.data(), becp.size()); + syncmem_complex_h2d_op()(d_dbecp, dbecp.data(), dbecp.size()); hamilt::cal_force_nl_op()(gpu_ctx, multi_proj, @@ -2998,23 +2998,23 @@ TEST_F(TestSrcPWForceMultiDevice, cal_force_nl_op_gpu) d_becp, d_dbecp, d_res); - syncmem_var_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size()); + syncmem_var_d2h_op()(res.data(), d_res, res.size()); for (int ii = 0; ii < res.size(); ii++) { EXPECT_LT(fabs(res[ii] - expected_force[ii]), 6e-5); } - delmem_var_op()(gpu_ctx, d_wg); - delmem_var_op()(gpu_ctx, d_res); - delmem_var_op()(gpu_ctx, d_deeq); - delmem_var_op()(gpu_ctx, d_ekb); - delmem_var_op()(gpu_ctx, d_qq_nt); + delmem_var_op()(d_wg); + delmem_var_op()(d_res); + delmem_var_op()(d_deeq); + delmem_var_op()(d_ekb); + delmem_var_op()(d_qq_nt); - delmem_int_op()(gpu_ctx, d_atom_nh); - delmem_int_op()(gpu_ctx, d_atom_na); + delmem_int_op()(d_atom_nh); + delmem_int_op()(d_atom_na); - delmem_complex_op()(gpu_ctx, d_becp); - delmem_complex_op()(gpu_ctx, d_dbecp); + delmem_complex_op()(d_becp); + delmem_complex_op()(d_dbecp); } #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/meta_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/meta_op_test.cpp index d9e9244004..85caa61f4b 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/meta_op_test.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/meta_op_test.cpp @@ -60,24 +60,24 @@ TEST_F(TestModuleHamiltMeta, meta_pw_op_gpu) std::vector> res(expected_out.size(), std::complex {0, 0}); double * d_gcar = nullptr, * d_kvec_c = nullptr; std::complex* d_in = nullptr, * d_res = nullptr; - resmem_var_op()(gpu_ctx, d_gcar, gcar.size()); - resmem_var_op()(gpu_ctx, d_kvec_c, kvec_c.size()); - resmem_complex_op()(gpu_ctx, d_in, in.size()); - resmem_complex_op()(gpu_ctx, d_res, res.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_gcar, gcar.data(), gcar.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_kvec_c, kvec_c.data(), kvec_c.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_in, in.data(), in.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size()); + resmem_var_op()(d_gcar, gcar.size()); + resmem_var_op()(d_kvec_c, kvec_c.size()); + resmem_complex_op()(d_in, in.size()); + resmem_complex_op()(d_res, res.size()); + syncmem_var_h2d_op()(d_gcar, gcar.data(), gcar.size()); + syncmem_var_h2d_op()(d_kvec_c, kvec_c.data(), kvec_c.size()); + syncmem_complex_h2d_op()(d_in, in.data(), in.size()); + syncmem_complex_h2d_op()(d_res, res.data(), res.size()); meta_gpu_op()(gpu_ctx, ik, pol, npw, npwx, tpiba, d_gcar, d_kvec_c, d_in, d_res); - syncmem_complex_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size()); + syncmem_complex_d2h_op()(res.data(), d_res, res.size()); for (int ii = 0; ii < res.size(); ii++) { EXPECT_LT(fabs(res[ii] - expected_out[ii]), 6e-5); } - delmem_var_op()(gpu_ctx, d_gcar); - delmem_var_op()(gpu_ctx, d_kvec_c); - delmem_complex_op()(gpu_ctx, d_in); - delmem_complex_op()(gpu_ctx, d_res); + delmem_var_op()(d_gcar); + delmem_var_op()(d_kvec_c); + delmem_complex_op()(d_in); + delmem_complex_op()(d_res); } #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/nonlocal_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/nonlocal_op_test.cpp index 8591182d4b..47deaec255 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/nonlocal_op_test.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/nonlocal_op_test.cpp @@ -127,12 +127,12 @@ TEST_F(TestModuleHamiltNonlocal, nonlocal_pw_op_gpu) double* deeq_dev = NULL; std::complex* ps_dev = NULL, * becp_dev = NULL; std::vector> ps(expected_ps.size(), std::complex(0.0, 0.0)); - resize_memory_double_op()(gpu_ctx, deeq_dev, deeq.size()); - resize_memory_complex_double_op()(gpu_ctx, ps_dev, ps.size()); - resize_memory_complex_double_op()(gpu_ctx, becp_dev, becp.size()); - syncmem_d_h2d_op()(gpu_ctx, cpu_ctx, deeq_dev, deeq.data(), deeq.size()); - syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, ps_dev, ps.data(), ps.size()); - syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, becp_dev, becp.data(), becp.size()); + resize_memory_double_op()(deeq_dev, deeq.size()); + resize_memory_complex_double_op()(ps_dev, ps.size()); + resize_memory_complex_double_op()(becp_dev, becp.size()); + syncmem_d_h2d_op()(deeq_dev, deeq.data(), deeq.size()); + syncmem_cd_h2d_op()(ps_dev, ps.data(), ps.size()); + syncmem_cd_h2d_op()(becp_dev, becp.data(), becp.size()); nonlocal_gpu_op()( gpu_ctx, l1, l2, l3, @@ -141,15 +141,15 @@ TEST_F(TestModuleHamiltNonlocal, nonlocal_pw_op_gpu) deeq_dev, ps_dev, becp_dev); - syncmem_cd_d2h_op()(cpu_ctx, gpu_ctx, ps.data(), ps_dev, ps.size()); + syncmem_cd_d2h_op()(ps.data(), ps_dev, ps.size()); for (int ii = 0; ii < ps.size(); ii++) { EXPECT_LT(fabs(ps[ii] - expected_ps[ii]), 5 * 1e-6); } EXPECT_EQ(sum, expected_sum); EXPECT_EQ(iat, expected_iat); - delete_memory_double_op()(gpu_ctx, deeq_dev); - delete_memory_complex_double_op()(gpu_ctx, ps_dev); - delete_memory_complex_double_op()(gpu_ctx, becp_dev); + delete_memory_double_op()(deeq_dev); + delete_memory_complex_double_op()(ps_dev); + delete_memory_complex_double_op()(becp_dev); } TEST_F(TestModuleHamiltNonlocal, nonlocal_pw_spin_op_gpu) @@ -157,12 +157,12 @@ TEST_F(TestModuleHamiltNonlocal, nonlocal_pw_spin_op_gpu) sum = 0; iat = 0; std::complex* ps_dev = NULL, * becp_dev = NULL, * deeq_dev = NULL; std::vector> ps(expected_ps.size(), std::complex(0.0, 0.0)); - resize_memory_complex_double_op()(gpu_ctx, deeq_dev, deeq_spin.size()); - resize_memory_complex_double_op()(gpu_ctx, ps_dev, ps.size()); - resize_memory_complex_double_op()(gpu_ctx, becp_dev, becp_spin.size()); - syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, deeq_dev, deeq_spin.data(), deeq_spin.size()); - syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, ps_dev, ps.data(), ps.size()); - syncmem_cd_h2d_op()(gpu_ctx, cpu_ctx, becp_dev, becp_spin.data(), becp_spin.size()); + resize_memory_complex_double_op()(deeq_dev, deeq_spin.size()); + resize_memory_complex_double_op()(ps_dev, ps.size()); + resize_memory_complex_double_op()(becp_dev, becp_spin.size()); + syncmem_cd_h2d_op()(deeq_dev, deeq_spin.data(), deeq_spin.size()); + syncmem_cd_h2d_op()(ps_dev, ps.data(), ps.size()); + syncmem_cd_h2d_op()(becp_dev, becp_spin.data(), becp_spin.size()); nonlocal_gpu_op()( gpu_ctx, l1, l2_spin, l3, @@ -171,14 +171,14 @@ TEST_F(TestModuleHamiltNonlocal, nonlocal_pw_spin_op_gpu) deeq_dev, ps_dev, becp_dev); - syncmem_cd_d2h_op()(cpu_ctx, gpu_ctx, ps.data(), ps_dev, ps.size()); + syncmem_cd_d2h_op()(ps.data(), ps_dev, ps.size()); for (int ii = 0; ii < ps.size(); ii++) { EXPECT_LT(fabs(ps[ii] - expected_ps_spin[ii]), 5 * 1e-6); } EXPECT_EQ(sum, expected_sum); EXPECT_EQ(iat, expected_iat); - delete_memory_complex_double_op()(gpu_ctx, deeq_dev); - delete_memory_complex_double_op()(gpu_ctx, ps_dev); - delete_memory_complex_double_op()(gpu_ctx, becp_dev); + delete_memory_complex_double_op()(deeq_dev); + delete_memory_complex_double_op()(ps_dev); + delete_memory_complex_double_op()(becp_dev); } #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp index cbf434da0c..a3be95fce8 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_test.cpp @@ -137,24 +137,24 @@ TEST(TestSrcPWStressMultiDevice, cal_dbecp_noevc_nl_op_gpu) std::complex * d_vkb0i = nullptr, * d_vkb0j = nullptr, * d_vkb = nullptr, * d_vkb1 = nullptr, * d_vkb2 = nullptr, * d_dbecp_noevc = nullptr; double * d_gcar = nullptr, * d_kvec_c = nullptr; - resmem_zd_op()(gpu_ctx, d_vkb0i, vkb0i.size()); - resmem_zd_op()(gpu_ctx, d_vkb0j, vkb0j.size()); - resmem_zd_op()(gpu_ctx, d_vkb, vkb.size()); - resmem_zd_op()(gpu_ctx, d_vkb1, vkb1.size()); - resmem_zd_op()(gpu_ctx, d_vkb2, vkb2.size()); - resmem_zd_op()(gpu_ctx, d_dbecp_noevc, dbecp_noevc.size()); - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_vkb0i, vkb0i.data(), vkb0i.size()); - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_vkb0j, vkb0j.data(), vkb0j.size()); - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_vkb, vkb.data(), vkb.size()); - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_vkb1, vkb1.data(), vkb1.size()); - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_vkb2, vkb2.data(), vkb2.size()); - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_dbecp_noevc, dbecp_noevc.data(), dbecp_noevc.size()); - - resmem_dd_op()(gpu_ctx, d_gcar, gcar.size()); - resmem_dd_op()(gpu_ctx, d_kvec_c, kvec_c.size()); - - syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_gcar, gcar.data(), gcar.size()); - syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_kvec_c, kvec_c.data(), kvec_c.size()); + resmem_zd_op()(d_vkb0i, vkb0i.size()); + resmem_zd_op()(d_vkb0j, vkb0j.size()); + resmem_zd_op()(d_vkb, vkb.size()); + resmem_zd_op()(d_vkb1, vkb1.size()); + resmem_zd_op()(d_vkb2, vkb2.size()); + resmem_zd_op()(d_dbecp_noevc, dbecp_noevc.size()); + syncmem_z2z_h2d_op()(d_vkb0i, vkb0i.data(), vkb0i.size()); + syncmem_z2z_h2d_op()(d_vkb0j, vkb0j.data(), vkb0j.size()); + syncmem_z2z_h2d_op()(d_vkb, vkb.data(), vkb.size()); + syncmem_z2z_h2d_op()(d_vkb1, vkb1.data(), vkb1.size()); + syncmem_z2z_h2d_op()(d_vkb2, vkb2.data(), vkb2.size()); + syncmem_z2z_h2d_op()(d_dbecp_noevc, dbecp_noevc.data(), dbecp_noevc.size()); + + resmem_dd_op()(d_gcar, gcar.size()); + resmem_dd_op()(d_kvec_c, kvec_c.size()); + + syncmem_d2d_h2d_op()(d_gcar, gcar.data(), gcar.size()); + syncmem_d2d_h2d_op()(d_kvec_c, kvec_c.data(), kvec_c.size()); hamilt::cal_dbecp_noevc_nl_op()(gpu_ctx, ipol, @@ -173,21 +173,21 @@ TEST(TestSrcPWStressMultiDevice, cal_dbecp_noevc_nl_op_gpu) d_vkb2, d_dbecp_noevc); - syncmem_z2z_d2h_op()(cpu_ctx, gpu_ctx, dbecp_noevc.data(), d_dbecp_noevc, dbecp_noevc.size()); + syncmem_z2z_d2h_op()(dbecp_noevc.data(), d_dbecp_noevc, dbecp_noevc.size()); for (int ii = 0; ii < dbecp_noevc.size(); ii++) { EXPECT_LT(fabs(dbecp_noevc[ii] - expected_dbecpnoevc[ii]), 6e-5); } - delmem_zd_op()(gpu_ctx, d_vkb0i); - delmem_zd_op()(gpu_ctx, d_vkb0j); - delmem_zd_op()(gpu_ctx, d_vkb); - delmem_zd_op()(gpu_ctx, d_vkb1); - delmem_zd_op()(gpu_ctx, d_vkb2); - delmem_zd_op()(gpu_ctx, d_dbecp_noevc); + delmem_zd_op()(d_vkb0i); + delmem_zd_op()(d_vkb0j); + delmem_zd_op()(d_vkb); + delmem_zd_op()(d_vkb1); + delmem_zd_op()(d_vkb2); + delmem_zd_op()(d_dbecp_noevc); - delmem_dd_op()(gpu_ctx, d_gcar); - delmem_dd_op()(gpu_ctx, d_kvec_c); + delmem_dd_op()(d_gcar); + delmem_dd_op()(d_kvec_c); } TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_gpu) @@ -236,31 +236,31 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_gpu) double * d_wg = nullptr, * d_deeq = nullptr, * d_stress = nullptr; double * d_ekb = nullptr, * d_qq_nt = nullptr; int * d_atom_nh = nullptr, * d_atom_na = nullptr; - resmem_zd_op()(gpu_ctx, d_becp, becp.size()); - resmem_zd_op()(gpu_ctx, d_dbecp, dbecp.size()); - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_becp, becp.data(), becp.size()); - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, d_dbecp, dbecp.data(), dbecp.size()); - - resmem_dd_op()(gpu_ctx, d_wg, wg.size()); - resmem_dd_op()(gpu_ctx, d_deeq, deeq.size()); - resmem_dd_op()(gpu_ctx, d_stress, stress.size()); - resmem_dd_op()(gpu_ctx, d_ekb, ekb.size()); - resmem_dd_op()(gpu_ctx, d_qq_nt, qq_nt.size()); - syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_wg, wg.data(), wg.size()); - syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_deeq, deeq.data(), deeq.size()); - syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_stress, stress.data(), stress.size()); - syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_ekb, ekb.data(), ekb.size()); - syncmem_d2d_h2d_op()(gpu_ctx, cpu_ctx, d_qq_nt, qq_nt.data(), qq_nt.size()); + resmem_zd_op()(d_becp, becp.size()); + resmem_zd_op()(d_dbecp, dbecp.size()); + syncmem_z2z_h2d_op()(d_becp, becp.data(), becp.size()); + syncmem_z2z_h2d_op()(d_dbecp, dbecp.data(), dbecp.size()); + + resmem_dd_op()(d_wg, wg.size()); + resmem_dd_op()(d_deeq, deeq.size()); + resmem_dd_op()(d_stress, stress.size()); + resmem_dd_op()(d_ekb, ekb.size()); + resmem_dd_op()(d_qq_nt, qq_nt.size()); + syncmem_d2d_h2d_op()(d_wg, wg.data(), wg.size()); + syncmem_d2d_h2d_op()(d_deeq, deeq.data(), deeq.size()); + syncmem_d2d_h2d_op()(d_stress, stress.data(), stress.size()); + syncmem_d2d_h2d_op()(d_ekb, ekb.data(), ekb.size()); + syncmem_d2d_h2d_op()(d_qq_nt, qq_nt.data(), qq_nt.size()); using delmem_int_op = base_device::memory::delete_memory_op; using resmem_int_op = base_device::memory::resize_memory_op; using syncmem_int_h2d_op = base_device::memory::synchronize_memory_op; - resmem_int_op()(gpu_ctx, d_atom_nh, atom_nh.size()); - resmem_int_op()(gpu_ctx, d_atom_na, atom_na.size()); - syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_nh, atom_nh.data(), atom_nh.size()); - syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_na, atom_na.data(), atom_na.size()); + resmem_int_op()(d_atom_nh, atom_nh.size()); + resmem_int_op()(d_atom_na, atom_na.size()); + syncmem_int_h2d_op()(d_atom_nh, atom_nh.data(), atom_nh.size()); + syncmem_int_h2d_op()(d_atom_na, atom_na.data(), atom_na.size()); hamilt::cal_stress_nl_op()(gpu_ctx, multi_proj, @@ -284,22 +284,22 @@ TEST(TestSrcPWStressMultiDevice, cal_stress_nl_op_gpu) d_dbecp, d_stress); - syncmem_d2d_d2h_op()(cpu_ctx, gpu_ctx, stress.data(), d_stress, stress.size()); + syncmem_d2d_d2h_op()(stress.data(), d_stress, stress.size()); for (int ii = 0; ii < stress.size(); ii++) { EXPECT_LT(fabs(stress[ii] - expected_stress[ii]), 6e-5); } - delmem_zd_op()(gpu_ctx, d_becp); - delmem_zd_op()(gpu_ctx, d_dbecp); + delmem_zd_op()(d_becp); + delmem_zd_op()(d_dbecp); - delmem_dd_op()(gpu_ctx, d_wg); - delmem_dd_op()(gpu_ctx, d_deeq); - delmem_dd_op()(gpu_ctx, d_stress); - delmem_dd_op()(gpu_ctx, d_ekb); - delmem_dd_op()(gpu_ctx, d_qq_nt); + delmem_dd_op()(d_wg); + delmem_dd_op()(d_deeq); + delmem_dd_op()(d_stress); + delmem_dd_op()(d_ekb); + delmem_dd_op()(d_qq_nt); - delmem_int_op()(gpu_ctx, d_atom_nh); - delmem_int_op()(gpu_ctx, d_atom_na); + delmem_int_op()(d_atom_nh); + delmem_int_op()(d_atom_na); } #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/veff_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/veff_op_test.cpp index 318646f063..56c96157fd 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/veff_op_test.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/veff_op_test.cpp @@ -89,19 +89,19 @@ TEST_F(TestModuleHamiltVeff, veff_pw_op_gpu) std::vector> res = out; double* d_in = NULL; std::complex* d_res = NULL; - resize_memory_double_op()(gpu_ctx, d_in, in.size()); - resize_memory_complex_op()(gpu_ctx, d_res, res.size()); - syncmem_double_h2d_op()(gpu_ctx, cpu_ctx, d_in, in.data(), in.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size()); + resize_memory_double_op()(d_in, in.size()); + resize_memory_complex_op()(d_res, res.size()); + syncmem_double_h2d_op()(d_in, in.data(), in.size()); + syncmem_complex_h2d_op()(d_res, res.data(), res.size()); veff_gpu_op()(gpu_ctx, this->size, d_res, d_in); - syncmem_complex_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size()); + syncmem_complex_d2h_op()(res.data(), d_res, res.size()); for (int ii = 0; ii < res.size(); ii++) { EXPECT_LT(fabs(res[ii] - expected_out[ii]), 6e-5); } - delete_memory_double_op()(gpu_ctx, d_in); - delete_memory_complex_op()(gpu_ctx, d_res); + delete_memory_double_op()(d_in); + delete_memory_complex_op()(d_res); } TEST_F(TestModuleHamiltVeff, veff_pw_spin_op_gpu) @@ -112,12 +112,12 @@ TEST_F(TestModuleHamiltVeff, veff_pw_spin_op_gpu) std::vector> res1 = out1_spin; double* d_in = NULL; std::complex* d_res = NULL, * d_res1 = NULL; - resize_memory_double_op()(gpu_ctx, d_in, in_spin.size()); - resize_memory_complex_op()(gpu_ctx, d_res, res.size()); - resize_memory_complex_op()(gpu_ctx, d_res1, res1.size()); - syncmem_double_h2d_op()(gpu_ctx, cpu_ctx, d_in, in_spin.data(), in_spin.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res, res.data(), res.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_res1, res1.data(), res1.size()); + resize_memory_double_op()(d_in, in_spin.size()); + resize_memory_complex_op()(d_res, res.size()); + resize_memory_complex_op()(d_res1, res1.size()); + syncmem_double_h2d_op()(d_in, in_spin.data(), in_spin.size()); + syncmem_complex_h2d_op()(d_res, res.data(), res.size()); + syncmem_complex_h2d_op()(d_res1, res1.data(), res1.size()); const double * in_[4]; for (int ii = 0; ii < 4; ii++) { @@ -126,14 +126,14 @@ TEST_F(TestModuleHamiltVeff, veff_pw_spin_op_gpu) veff_gpu_op()(gpu_ctx, this->size, d_res, d_res1, in_); - syncmem_complex_d2h_op()(cpu_ctx, gpu_ctx, res.data(), d_res, res.size()); - syncmem_complex_d2h_op()(cpu_ctx, gpu_ctx, res1.data(), d_res1, res1.size()); + syncmem_complex_d2h_op()(res.data(), d_res, res.size()); + syncmem_complex_d2h_op()(res1.data(), d_res1, res1.size()); for (int ii = 0; ii < res.size(); ii++) { EXPECT_LT(fabs(res[ii] - expected_out_spin[ii]), 7.5e-5); EXPECT_LT(fabs(res1[ii] - expected_out1_spin[ii]), 6e-5); } - delete_memory_double_op()(gpu_ctx, d_in); - delete_memory_complex_op()(gpu_ctx, d_res); - delete_memory_complex_op()(gpu_ctx, d_res1); + delete_memory_double_op()(d_in); + delete_memory_complex_op()(d_res); + delete_memory_complex_op()(d_res1); } #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/vnl_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/vnl_op_test.cpp index 428304c52d..be5e6a8a68 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/vnl_op_test.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/vnl_op_test.cpp @@ -4049,34 +4049,34 @@ TEST_F(TestSrcPWVnlMultiDevice, cal_vnl_op_gpu) *d_tab = nullptr, *d_vkb1 = nullptr; std::complex*d_sk = nullptr, *d_vkb = nullptr; - resmem_int_op()(gpu_ctx, d_atom_na, atom_na.size()); - resmem_int_op()(gpu_ctx, d_atom_nb, atom_nb.size()); - resmem_int_op()(gpu_ctx, d_atom_nh, atom_nh.size()); - syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_na, atom_na.data(), atom_na.size()); - syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_nb, atom_nb.data(), atom_nb.size()); - syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_nh, atom_nh.data(), atom_nh.size()); + resmem_int_op()(d_atom_na, atom_na.size()); + resmem_int_op()(d_atom_nb, atom_nb.size()); + resmem_int_op()(d_atom_nh, atom_nh.size()); + syncmem_int_h2d_op()(d_atom_na, atom_na.data(), atom_na.size()); + syncmem_int_h2d_op()(d_atom_nb, atom_nb.data(), atom_nb.size()); + syncmem_int_h2d_op()(d_atom_nh, atom_nh.data(), atom_nh.size()); - resmem_var_op()(gpu_ctx, d_gk, gk.size()); - resmem_var_op()(gpu_ctx, d_ylm, ylm.size()); - resmem_var_op()(gpu_ctx, d_indv, indv.size()); - resmem_var_op()(gpu_ctx, d_nhtol, nhtol.size()); - resmem_var_op()(gpu_ctx, d_nhtolm, nhtolm.size()); - resmem_var_op()(gpu_ctx, d_tab, tab.size()); - resmem_var_op()(gpu_ctx, d_vkb1, vkb1.size()); + resmem_var_op()(d_gk, gk.size()); + resmem_var_op()(d_ylm, ylm.size()); + resmem_var_op()(d_indv, indv.size()); + resmem_var_op()(d_nhtol, nhtol.size()); + resmem_var_op()(d_nhtolm, nhtolm.size()); + resmem_var_op()(d_tab, tab.size()); + resmem_var_op()(d_vkb1, vkb1.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_gk, gk.data(), gk.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_ylm, ylm.data(), ylm.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_indv, indv.data(), indv.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_nhtol, nhtol.data(), nhtol.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_nhtolm, nhtolm.data(), nhtolm.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_tab, tab.data(), tab.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_vkb1, vkb1.data(), vkb1.size()); + syncmem_var_h2d_op()(d_gk, gk.data(), gk.size()); + syncmem_var_h2d_op()(d_ylm, ylm.data(), ylm.size()); + syncmem_var_h2d_op()(d_indv, indv.data(), indv.size()); + syncmem_var_h2d_op()(d_nhtol, nhtol.data(), nhtol.size()); + syncmem_var_h2d_op()(d_nhtolm, nhtolm.data(), nhtolm.size()); + syncmem_var_h2d_op()(d_tab, tab.data(), tab.size()); + syncmem_var_h2d_op()(d_vkb1, vkb1.data(), vkb1.size()); - resmem_complex_op()(gpu_ctx, d_sk, sk.size()); - resmem_complex_op()(gpu_ctx, d_vkb, vkb.size()); + resmem_complex_op()(d_sk, sk.size()); + resmem_complex_op()(d_vkb, vkb.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_sk, sk.data(), sk.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_vkb, vkb.data(), vkb.size()); + syncmem_complex_h2d_op()(d_sk, sk.data(), sk.size()); + syncmem_complex_h2d_op()(d_vkb, vkb.data(), vkb.size()); hamilt::cal_vnl_op()(gpu_ctx, ntype, @@ -4101,26 +4101,26 @@ TEST_F(TestSrcPWVnlMultiDevice, cal_vnl_op_gpu) d_sk, d_vkb); - syncmem_complex_d2h_op()(cpu_ctx, gpu_ctx, vkb.data(), d_vkb, vkb.size()); + syncmem_complex_d2h_op()(vkb.data(), d_vkb, vkb.size()); for (int ii = 0; ii < vkb.size(); ii++) { EXPECT_LT(fabs(vkb[ii] - expected_vkb[ii]), 6e-5); } - delmem_int_op()(gpu_ctx, d_atom_na); - delmem_int_op()(gpu_ctx, d_atom_nh); - delmem_int_op()(gpu_ctx, d_atom_nb); + delmem_int_op()(d_atom_na); + delmem_int_op()(d_atom_nh); + delmem_int_op()(d_atom_nb); - delmem_var_op()(gpu_ctx, d_gk); - delmem_var_op()(gpu_ctx, d_ylm); - delmem_var_op()(gpu_ctx, d_indv); - delmem_var_op()(gpu_ctx, d_nhtol); - delmem_var_op()(gpu_ctx, d_nhtolm); - delmem_var_op()(gpu_ctx, d_tab); - delmem_var_op()(gpu_ctx, d_vkb1); + delmem_var_op()(d_gk); + delmem_var_op()(d_ylm); + delmem_var_op()(d_indv); + delmem_var_op()(d_nhtol); + delmem_var_op()(d_nhtolm); + delmem_var_op()(d_tab); + delmem_var_op()(d_vkb1); - delmem_complex_op()(gpu_ctx, d_sk); - delmem_complex_op()(gpu_ctx, d_vkb); + delmem_complex_op()(d_sk); + delmem_complex_op()(d_vkb); } #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/wf_op_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/wf_op_test.cpp index 8b46679d67..2463234c31 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/wf_op_test.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/wf_op_test.cpp @@ -412,26 +412,26 @@ TEST_F(TestSrcPWWfMultiDevice, cal_sk_op_gpu) double * d_kvec_c = nullptr, * d_atom_tau = nullptr; std::complex * d_sk = nullptr, * d_eigts1 = nullptr, * d_eigts2 = nullptr, * d_eigts3 = nullptr; - resmem_int_op()(gpu_ctx, d_atom_na, atom_na.size()); - resmem_int_op()(gpu_ctx, d_igl2isz, igl2isz.size()); - resmem_int_op()(gpu_ctx, d_is2fftixy, is2fftixy.size()); - syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_atom_na, atom_na.data(), atom_na.size()); - syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_igl2isz, igl2isz.data(), igl2isz.size()); - syncmem_int_h2d_op()(gpu_ctx, cpu_ctx, d_is2fftixy, is2fftixy.data(), is2fftixy.size()); + resmem_int_op()(d_atom_na, atom_na.size()); + resmem_int_op()(d_igl2isz, igl2isz.size()); + resmem_int_op()(d_is2fftixy, is2fftixy.size()); + syncmem_int_h2d_op()(d_atom_na, atom_na.data(), atom_na.size()); + syncmem_int_h2d_op()(d_igl2isz, igl2isz.data(), igl2isz.size()); + syncmem_int_h2d_op()(d_is2fftixy, is2fftixy.data(), is2fftixy.size()); - resmem_var_op()(gpu_ctx, d_kvec_c, kvec_c.size()); - resmem_var_op()(gpu_ctx, d_atom_tau, atom_tau.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_kvec_c, kvec_c.data(), kvec_c.size()); - syncmem_var_h2d_op()(gpu_ctx, cpu_ctx, d_atom_tau, atom_tau.data(), atom_tau.size()); + resmem_var_op()(d_kvec_c, kvec_c.size()); + resmem_var_op()(d_atom_tau, atom_tau.size()); + syncmem_var_h2d_op()(d_kvec_c, kvec_c.data(), kvec_c.size()); + syncmem_var_h2d_op()(d_atom_tau, atom_tau.data(), atom_tau.size()); - resmem_complex_op()(gpu_ctx, d_sk, sk.size()); - resmem_complex_op()(gpu_ctx, d_eigts1, eigts1.size()); - resmem_complex_op()(gpu_ctx, d_eigts2, eigts2.size()); - resmem_complex_op()(gpu_ctx, d_eigts3, eigts3.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_sk, sk.data(), sk.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_eigts1, eigts1.data(), eigts1.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_eigts2, eigts2.data(), eigts2.size()); - syncmem_complex_h2d_op()(gpu_ctx, cpu_ctx, d_eigts3, eigts3.data(), eigts3.size()); + resmem_complex_op()(d_sk, sk.size()); + resmem_complex_op()(d_eigts1, eigts1.size()); + resmem_complex_op()(d_eigts2, eigts2.size()); + resmem_complex_op()(d_eigts3, eigts3.size()); + syncmem_complex_h2d_op()(d_sk, sk.data(), sk.size()); + syncmem_complex_h2d_op()(d_eigts1, eigts1.data(), eigts1.size()); + syncmem_complex_h2d_op()(d_eigts2, eigts2.data(), eigts2.size()); + syncmem_complex_h2d_op()(d_eigts3, eigts3.data(), eigts3.size()); hamilt::cal_sk_op()(gpu_ctx, ik, @@ -459,22 +459,22 @@ TEST_F(TestSrcPWWfMultiDevice, cal_sk_op_gpu) d_eigts3, d_sk); - syncmem_complex_d2h_op()(cpu_ctx, gpu_ctx, sk.data(), d_sk, sk.size()); + syncmem_complex_d2h_op()(sk.data(), d_sk, sk.size()); for (int ii = 0; ii < sk.size(); ii++) { EXPECT_LT(fabs(sk[ii] - expected_sk[ii]), 6e-5); } - delmem_int_op()(gpu_ctx, d_atom_na); - delmem_int_op()(gpu_ctx, d_igl2isz); - delmem_int_op()(gpu_ctx, d_is2fftixy); + delmem_int_op()(d_atom_na); + delmem_int_op()(d_igl2isz); + delmem_int_op()(d_is2fftixy); - delmem_var_op()(gpu_ctx, d_kvec_c); - delmem_var_op()(gpu_ctx, d_atom_tau); + delmem_var_op()(d_kvec_c); + delmem_var_op()(d_atom_tau); - delmem_complex_op()(gpu_ctx, d_sk); - delmem_complex_op()(gpu_ctx, d_eigts1); - delmem_complex_op()(gpu_ctx, d_eigts2); - delmem_complex_op()(gpu_ctx, d_eigts3); + delmem_complex_op()(d_sk); + delmem_complex_op()(d_eigts1); + delmem_complex_op()(d_eigts2); + delmem_complex_op()(d_eigts3); } #endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp b/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp index aa28b5abe2..79649fab07 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp +++ b/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp @@ -164,7 +164,7 @@ void Nonlocal_maths::cal_ylm(int lmax, int npw, const FPTYPE* q, // calculate ModuleBase::YlmReal::Ylm_Real(cpu_ctx, ntot_ylm, npw, q, ylm_cpu.data()); // send from cpu to gpu - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, ylm, ylm_cpu.data(), ylm_cpu.size()); + syncmem_var_h2d_op()(ylm, ylm_cpu.data(), ylm_cpu.size()); } else { @@ -193,7 +193,7 @@ void Nonlocal_maths::cal_ylm_deri(int lmax, int npw, const FPTYP Nonlocal_maths::dylmr2(ntot_ylm, npw, q, &dylmdq_cpu[ipol * ntot_ylm * npw], ipol); } // send from cpu to gpu - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, out, dylmdq_cpu.data(), dylmdq_cpu.size()); + syncmem_var_h2d_op()(out, dylmdq_cpu.data(), dylmdq_cpu.size()); } else { diff --git a/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.cpp index d4b7e51b65..e15793cbdc 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.cpp @@ -192,29 +192,29 @@ void Onsite_Proj_tools::allocate_memory(const ModuleBase::matrix // allocate the memory for vkb and vkb_deri. if (this->device == base_device::GpuDevice) { - resmem_int_op()(this->ctx, this->d_dvkb_indexes, max_nh * 4); + resmem_int_op()(this->d_dvkb_indexes, max_nh * 4); } - resmem_var_op()(this->ctx, this->hd_vq, nprojmax * max_npw); - resmem_var_op()(this->ctx, this->hd_vq_deri, nprojmax * max_npw); - resmem_var_op()(this->ctx, this->hd_ylm, (lprojmax + 1) * (lprojmax + 1) * max_npw); - resmem_var_op()(this->ctx, this->hd_ylm_deri, 3 * (lprojmax + 1) * (lprojmax + 1) * max_npw); + resmem_var_op()(this->hd_vq, nprojmax * max_npw); + resmem_var_op()(this->hd_vq_deri, nprojmax * max_npw); + resmem_var_op()(this->hd_ylm, (lprojmax + 1) * (lprojmax + 1) * max_npw); + resmem_var_op()(this->hd_ylm_deri, 3 * (lprojmax + 1) * (lprojmax + 1) * max_npw); if (this->device == base_device::GpuDevice) { - resmem_var_op()(this->ctx, d_wg, wg.nr * wg.nc); - resmem_var_op()(this->ctx, d_ekb, ekb.nr * ekb.nc); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, wg.c, wg.nr * wg.nc); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_ekb, ekb.c, ekb.nr * ekb.nc); - resmem_int_op()(this->ctx, atom_nh, this->ntype); - resmem_int_op()(this->ctx, atom_na, this->ntype); - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, atom_nh, h_atom_nh.data(), this->ntype); - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, atom_na, h_atom_na.data(), this->ntype); + resmem_var_op()(d_wg, wg.nr * wg.nc); + resmem_var_op()(d_ekb, ekb.nr * ekb.nc); + syncmem_var_h2d_op()(d_wg, wg.c, wg.nr * wg.nc); + syncmem_var_h2d_op()(d_ekb, ekb.c, ekb.nr * ekb.nc); + resmem_int_op()(atom_nh, this->ntype); + resmem_int_op()(atom_na, this->ntype); + syncmem_int_h2d_op()(atom_nh, h_atom_nh.data(), this->ntype); + syncmem_int_h2d_op()(atom_na, h_atom_na.data(), this->ntype); - resmem_var_op()(this->ctx, d_g_plus_k, max_npw * 5); - resmem_var_op()(this->ctx, d_pref, max_nh); - resmem_var_op()(this->ctx, d_vq_tab, this->tabtpr->getSize()); - resmem_complex_op()(this->ctx, d_pref_in, max_nh); + resmem_var_op()(d_g_plus_k, max_npw * 5); + resmem_var_op()(d_pref, max_nh); + resmem_var_op()(d_vq_tab, this->tabtpr->getSize()); + resmem_complex_op()(d_pref_in, max_nh); } else { @@ -230,39 +230,39 @@ void Onsite_Proj_tools::delete_memory() { // delete memory - delmem_var_op()(this->ctx, hd_vq); - delmem_var_op()(this->ctx, hd_vq_deri); - delmem_var_op()(this->ctx, hd_ylm); - delmem_var_op()(this->ctx, hd_ylm_deri); + delmem_var_op()(hd_vq); + delmem_var_op()(hd_vq_deri); + delmem_var_op()(hd_ylm); + delmem_var_op()(hd_ylm_deri); // delete memory on GPU if (this->device == base_device::GpuDevice) { - delmem_var_op()(this->ctx, d_wg); - delmem_var_op()(this->ctx, d_ekb); - delmem_int_op()(this->ctx, atom_nh); - delmem_int_op()(this->ctx, atom_na); - delmem_var_op()(this->ctx, d_g_plus_k); - delmem_var_op()(this->ctx, d_pref); - delmem_var_op()(this->ctx, d_vq_tab); - delmem_complex_op()(this->ctx, this->d_pref_in); - delmem_int_op()(this->ctx, d_dvkb_indexes); + delmem_var_op()(d_wg); + delmem_var_op()(d_ekb); + delmem_int_op()(atom_nh); + delmem_int_op()(atom_na); + delmem_var_op()(d_g_plus_k); + delmem_var_op()(d_pref); + delmem_var_op()(d_vq_tab); + delmem_complex_op()(this->d_pref_in); + delmem_int_op()(d_dvkb_indexes); } if (becp != nullptr) { - delmem_complex_op()(this->ctx, becp); - delmem_complex_op()(this->ctx, hd_sk); + delmem_complex_op()(becp); + delmem_complex_op()(hd_sk); } if (dbecp != nullptr) { - delmem_complex_op()(this->ctx, dbecp); + delmem_complex_op()(dbecp); } if (this->pre_ik_f != -1) { - delmem_int_op()(this->ctx, gcar_zero_indexes); - delmem_complex_op()(this->ctx, vkb_save); - delmem_var_op()(this->ctx, gcar); + delmem_int_op()(gcar_zero_indexes); + delmem_complex_op()(vkb_save); + delmem_var_op()(gcar); } } @@ -288,7 +288,7 @@ void Onsite_Proj_tools::cal_becp(int ik, const int npw = this->wfc_basis_->npwk[ik]; if (becp_in == nullptr && this->becp == nullptr) { - resmem_complex_op()(this->ctx, becp, this->nbands * npol * this->nkb); + resmem_complex_op()(becp, this->nbands * npol * this->nkb); } std::complex* becp_tmp = becp_in == nullptr ? this->becp : becp_in; const int size_becp_act = npm * npol * this->nkb; @@ -297,7 +297,7 @@ void Onsite_Proj_tools::cal_becp(int ik, const int size_becp = this->nbands * npol * this->nkb; if (this->becp == nullptr) { - resmem_complex_op()(this->ctx, becp, size_becp); + resmem_complex_op()(becp, size_becp); } // prepare math tools @@ -311,7 +311,7 @@ void Onsite_Proj_tools::cal_becp(int ik, // vq_tb has dimension (ntype, nproj, GlobalV::NQX) // calculate sk - resmem_complex_op()(ctx, hd_sk, this->ucell_->nat * npw); + resmem_complex_op()(hd_sk, this->ucell_->nat * npw); this->sf_->get_sk(ctx, ik, this->wfc_basis_, hd_sk); std::complex* d_sk = this->hd_sk; // prepare ylm,size: (lmax+1)^2 * this->max_npw @@ -347,8 +347,8 @@ void Onsite_Proj_tools::cal_becp(int ik, if (this->device == base_device::GpuDevice) { - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_g_plus_k, g_plus_k.data(), g_plus_k.size()); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vq_tab, this->tabtpr->ptr, this->tabtpr->getSize()); + syncmem_var_h2d_op()(d_g_plus_k, g_plus_k.data(), g_plus_k.size()); + syncmem_var_h2d_op()(d_vq_tab, this->tabtpr->ptr, this->tabtpr->getSize()); gk = d_g_plus_k; vq_tb = d_vq_tab; } @@ -390,8 +390,8 @@ void Onsite_Proj_tools::cal_becp(int ik, if (this->device == base_device::GpuDevice) { - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_dvkb_indexes, dvkb_indexes.data(), nh * 4); - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_pref_in, pref.data(), nh); + syncmem_int_h2d_op()(d_dvkb_indexes, dvkb_indexes.data(), nh * 4); + syncmem_complex_h2d_op()(d_pref_in, pref.data(), nh); } for (int ia = 0; ia < h_atom_na[it]; ia++) @@ -443,11 +443,11 @@ void Onsite_Proj_tools::cal_becp(int ik, if (this->device == base_device::GpuDevice) { std::complex* h_becp = nullptr; - resmem_complex_h_op()(this->cpu_ctx, h_becp, size_becp_act); - syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp_tmp, size_becp_act); + resmem_complex_h_op()(h_becp, size_becp_act); + syncmem_complex_d2h_op()(h_becp, becp_tmp, size_becp_act); Parallel_Reduce::reduce_pool(h_becp, size_becp_act); - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp_tmp, h_becp, size_becp_act); - delmem_complex_h_op()(this->cpu_ctx, h_becp); + syncmem_complex_h2d_op()(becp_tmp, h_becp, size_becp_act); + delmem_complex_h_op()(h_becp); } else { @@ -474,7 +474,7 @@ void Onsite_Proj_tools::cal_dbecp_s(int ik, int npm, int ipol, i const int npm_npol = npm * npol; if (this->dbecp == nullptr) { - resmem_complex_op()(this->ctx, dbecp, size_becp); + resmem_complex_op()(dbecp, size_becp); } // prepare math tools @@ -540,8 +540,8 @@ void Onsite_Proj_tools::cal_dbecp_s(int ik, int npm, int ipol, i this->dvkb_indexes.data()); if (this->device == base_device::GpuDevice) { - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_dvkb_indexes, dvkb_indexes.data(), nh * 4); - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_pref_in, pref.data(), nh); + syncmem_int_h2d_op()(d_dvkb_indexes, dvkb_indexes.data(), nh * 4); + syncmem_complex_h2d_op()(d_pref_in, pref.data(), nh); } for (int ia = 0; ia < h_atom_na[it]; ia++) { @@ -613,8 +613,8 @@ void Onsite_Proj_tools::cal_dbecp_f(int ik, int npm, int ipol) // calculate gcarx, gcary/gcarx and gcarz/gcary, overwrite gcar if (this->pre_ik_f == -1) // if it is the very first run, we allocate { - resmem_var_op()(this->ctx, gcar, 3 * this->wfc_basis_->npwk_max); - resmem_int_op()(this->ctx, gcar_zero_indexes, 3 * this->wfc_basis_->npwk_max); + resmem_var_op()(gcar, 3 * this->wfc_basis_->npwk_max); + resmem_int_op()(gcar_zero_indexes, 3 * this->wfc_basis_->npwk_max); } // first refresh the value of gcar_zero_indexes, gcar_zero_counts if (this->pre_ik_f != ik) @@ -647,7 +647,7 @@ void Onsite_Proj_tools::cal_dbecp_f(int ik, int npm, int ipol) const int size_becp = this->nbands * npol * this->nkb; if (this->dbecp == nullptr) // if it is the very first run, we allocate { // why not judging whether dbecp == nullptr inside resmem_complex_op? - resmem_complex_op()(this->ctx, dbecp, 3 * size_becp); + resmem_complex_op()(dbecp, 3 * size_becp); } // do gemm to get dbecp and revert the ppcell_vkb for next ipol const std::complex* ppsi = &(this->psi_[0](ik, 0, 0)); @@ -799,10 +799,10 @@ void Onsite_Proj_tools::transfer_gcar(int npw, int npw_max, cons } // prepare the memory for vkb_save const int max_count = std::max(gcar_zero_counts[0], std::max(gcar_zero_counts[1], gcar_zero_counts[2])); - resmem_complex_op()(this->ctx, this->vkb_save, this->nkb * max_count); + resmem_complex_op()(this->vkb_save, this->nkb * max_count); // transfer the gcar and gcar_zero_indexes to the device - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gcar, gcar_tmp.data(), 3 * npw_max); - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, gcar_zero_indexes, gcar_zero_indexes_tmp.data(), 3 * npw_max); + syncmem_var_h2d_op()(gcar, gcar_tmp.data(), 3 * npw_max); + syncmem_int_h2d_op()(gcar_zero_indexes, gcar_zero_indexes_tmp.data(), 3 * npw_max); } template @@ -819,11 +819,11 @@ void Onsite_Proj_tools::cal_force_dftu(int ik, #if defined(__CUDA) || defined(__ROCM) if (this->device == base_device::GpuDevice) { - resmem_int_op()(this->ctx, orbital_corr_tmp, this->ucell_->ntype); - syncmem_int_h2d_op()(this->ctx, cpu_ctx, orbital_corr_tmp, orbital_corr, this->ucell_->ntype); - resmem_complex_op()(this->ctx, vu_tmp, size_vu); - syncmem_complex_h2d_op()(this->ctx, cpu_ctx, vu_tmp, vu, size_vu); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, h_wg, this->nbands * (ik+1)); + resmem_int_op()(orbital_corr_tmp, this->ucell_->ntype); + syncmem_int_h2d_op()(orbital_corr_tmp, orbital_corr, this->ucell_->ntype); + resmem_complex_op()(vu_tmp, size_vu); + syncmem_complex_h2d_op()(vu_tmp, vu, size_vu); + syncmem_var_h2d_op()(d_wg, h_wg, this->nbands * (ik+1)); } else #endif @@ -853,8 +853,8 @@ void Onsite_Proj_tools::cal_force_dftu(int ik, #if defined(__CUDA) || defined(__ROCM) if (this->device == base_device::GpuDevice) { - delmem_complex_op()(this->ctx, vu_tmp); - delmem_int_op()(this->ctx, orbital_corr_tmp); + delmem_complex_op()(vu_tmp); + delmem_int_op()(orbital_corr_tmp); } #endif } @@ -877,9 +877,9 @@ void Onsite_Proj_tools::cal_force_dspin(int ik, #if defined(__CUDA) || defined(__ROCM) if (this->device == base_device::GpuDevice) { - resmem_var_op()(this->ctx, lambda_tmp, this->ucell_->nat * 3); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, lambda_tmp, lambda_array.data(), this->ucell_->nat * 3); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, h_wg, this->nbands * (ik+1)); + resmem_var_op()(lambda_tmp, this->ucell_->nat * 3); + syncmem_var_h2d_op()(lambda_tmp, lambda_array.data(), this->ucell_->nat * 3); + syncmem_var_h2d_op()(d_wg, h_wg, this->nbands * (ik+1)); } else #endif @@ -908,7 +908,7 @@ void Onsite_Proj_tools::cal_force_dspin(int ik, #if defined(__CUDA) || defined(__ROCM) if (this->device == base_device::GpuDevice) { - delmem_var_op()(this->ctx, lambda_tmp); + delmem_var_op()(lambda_tmp); } #endif } @@ -927,11 +927,11 @@ void Onsite_Proj_tools::cal_stress_dftu(int ik, #if defined(__CUDA) || defined(__ROCM) if (this->device == base_device::GpuDevice) { - resmem_int_op()(this->ctx, orbital_corr_tmp, this->ucell_->ntype); - syncmem_int_h2d_op()(this->ctx, cpu_ctx, orbital_corr_tmp, orbital_corr, this->ucell_->ntype); - resmem_complex_op()(this->ctx, vu_tmp, size_vu); - syncmem_complex_h2d_op()(this->ctx, cpu_ctx, vu_tmp, vu, size_vu); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, h_wg, this->nbands * (ik+1)); + resmem_int_op()(orbital_corr_tmp, this->ucell_->ntype); + syncmem_int_h2d_op()(orbital_corr_tmp, orbital_corr, this->ucell_->ntype); + resmem_complex_op()(vu_tmp, size_vu); + syncmem_complex_h2d_op()(vu_tmp, vu, size_vu); + syncmem_var_h2d_op()(d_wg, h_wg, this->nbands * (ik+1)); } else #endif @@ -957,8 +957,8 @@ void Onsite_Proj_tools::cal_stress_dftu(int ik, #if defined(__CUDA) || defined(__ROCM) if (this->device == base_device::GpuDevice) { - delmem_complex_op()(this->ctx, vu_tmp); - delmem_int_op()(this->ctx, orbital_corr_tmp); + delmem_complex_op()(vu_tmp); + delmem_int_op()(orbital_corr_tmp); } #endif } @@ -981,9 +981,9 @@ void Onsite_Proj_tools::cal_stress_dspin(int ik, #if defined(__CUDA) || defined(__ROCM) if (this->device == base_device::GpuDevice) { - resmem_var_op()(this->ctx, lambda_tmp, this->ucell_->nat * 3); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, lambda_tmp, lambda_array.data(), this->ucell_->nat * 3); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, h_wg, this->nbands * (ik+1)); + resmem_var_op()(lambda_tmp, this->ucell_->nat * 3); + syncmem_var_h2d_op()(lambda_tmp, lambda_array.data(), this->ucell_->nat * 3); + syncmem_var_h2d_op()(d_wg, h_wg, this->nbands * (ik+1)); } else #endif @@ -1009,7 +1009,7 @@ void Onsite_Proj_tools::cal_stress_dspin(int ik, #if defined(__CUDA) || defined(__ROCM) if (this->device == base_device::GpuDevice) { - delmem_var_op()(this->ctx, lambda_tmp); + delmem_var_op()(lambda_tmp); } #endif } diff --git a/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.cpp b/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.cpp index 2bb69dc131..f235df15e5 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.cpp @@ -173,7 +173,7 @@ void projectors::OnsiteProjector::init(const std::string& orbital_dir this->tot_nproj = itiaiprojm2irow_.size(); this->npwx_ = this->pw_basis_->npwk_max; this->size_vproj = this->tot_nproj * this->npwx_; - resmem_complex_op()(this->ctx, this->tab_atomic_, this->size_vproj, "OnsiteP::tab_atomic_"); + resmem_complex_op()(this->tab_atomic_, this->size_vproj, "OnsiteP::tab_atomic_"); } delete this->fs_tools; // it is okay to delete nullptr @@ -191,12 +191,12 @@ projectors::OnsiteProjector::~OnsiteProjector() { //delete[] becp; delete fs_tools; - delmem_complex_op()(this->ctx, this->tab_atomic_); + delmem_complex_op()(this->tab_atomic_); if(this->device == base_device::GpuDevice) { - delmem_complex_h_op()(this->cpu_ctx, this->h_becp); + delmem_complex_h_op()(this->h_becp); } - delmem_complex_op()(this->ctx, this->becp); + delmem_complex_op()(this->becp); } @@ -390,10 +390,10 @@ void projectors::OnsiteProjector::overlap_proj_psi( if(this->becp == nullptr || this->size_becp < npm*this->tot_nproj) { this->size_becp = npm*this->tot_nproj; - resmem_complex_op()(this->ctx, this->becp, this->size_becp); + resmem_complex_op()(this->becp, this->size_becp); if(this->device == base_device::GpuDevice ) { - resmem_complex_h_op()(this->cpu_ctx, this->h_becp, this->size_becp); + resmem_complex_h_op()(this->h_becp, this->size_becp); } else { @@ -403,7 +403,7 @@ void projectors::OnsiteProjector::overlap_proj_psi( this->fs_tools->cal_becp(ik_, npm/npol, this->becp, ppsi); // in cal_becp, npm should be the one not multiplied by npol if(this->device == base_device::GpuDevice) { - syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, this->becp, this->size_becp); + syncmem_complex_d2h_op()(h_becp, this->becp, this->size_becp); } ModuleBase::timer::tick("OnsiteProj", "overlap"); } diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/meta_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/meta_pw.cpp index b0372109dc..dc8a566d05 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/meta_pw.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/meta_pw.cpp @@ -27,14 +27,14 @@ Meta>::Meta(Real tpiba_in, this->vk_row = vk_row; this->vk_col = vk_col; this->wfcpw = wfcpw_in; - resmem_complex_op()(this->ctx, this->porter, this->wfcpw->nmaxgr, "Meta::porter"); + resmem_complex_op()(this->porter, this->wfcpw->nmaxgr, "Meta::porter"); } template Meta>::~Meta() { - delmem_complex_op()(this->ctx, this->porter); + delmem_complex_op()(this->porter); } template @@ -55,7 +55,7 @@ void Meta>::act( ModuleBase::timer::tick("Operator", "MetaPW"); if(is_first_node) { - setmem_complex_op()(this->ctx, tmhpsi, 0, nbasis*nbands/npol); + setmem_complex_op()(tmhpsi, 0, nbasis*nbands/npol); } const int current_spin = this->isk[this->ik]; diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/nonlocal_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/nonlocal_pw.cpp index 563e9d23a0..7446151d36 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/nonlocal_pw.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/nonlocal_pw.cpp @@ -35,8 +35,8 @@ Nonlocal>::Nonlocal(const int* isk_in, template Nonlocal>::~Nonlocal() { - delmem_complex_op()(this->ctx, this->ps); - delmem_complex_op()(this->ctx, this->becp); + delmem_complex_op()(this->ps); + delmem_complex_op()(this->becp); } template @@ -72,10 +72,10 @@ void Nonlocal>::add_nonlocal_pp(T *hpsi_in, const T *becp, // T *ps = new T[nkb * m]; // ModuleBase::GlobalFunc::ZEROS(ps, m * nkb); if (this->nkb_m < m * nkb) { - resmem_complex_op()(this->ctx, this->ps, nkb * m, "Nonlocal::ps"); + resmem_complex_op()(this->ps, nkb * m, "Nonlocal::ps"); this->nkb_m = m * nkb; } - setmem_complex_op()(this->ctx, this->ps, 0, nkb * m); + setmem_complex_op()(this->ps, 0, nkb * m); int sum = 0; int iat = 0; @@ -221,7 +221,7 @@ void Nonlocal>::act( ModuleBase::timer::tick("Operator", "NonlocalPW"); if(is_first_node) { - setmem_complex_op()(this->ctx, tmhpsi, 0, nbasis*nbands/npol); + setmem_complex_op()(tmhpsi, 0, nbasis*nbands/npol); } if(!PARAM.inp.use_paw) { @@ -235,7 +235,7 @@ void Nonlocal>::act( // qianrui optimize 2021-3-31 int nkb = this->ppcell->nkb; if (this->nkb_m < nbands * nkb) { - resmem_complex_op()(this->ctx, this->becp, nbands * nkb, "Nonlocal::becp"); + resmem_complex_op()(this->becp, nbands * nkb, "Nonlocal::becp"); } // ModuleBase::ComplexMatrix becp(nbands, nkb, false); char transa = 'C'; diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.cpp index 39f0c1458a..3cfd345356 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.cpp @@ -30,22 +30,22 @@ OnsiteProj>::OnsiteProj(const int* isk_in, template OnsiteProj>::~OnsiteProj() { - delmem_complex_op()(this->ctx, this->ps); + delmem_complex_op()(this->ps); if(this->init_delta_spin) { - delmem_int_op()(this->ctx, this->ip_iat); - delmem_complex_op()(this->ctx, this->lambda_coeff); + delmem_int_op()(this->ip_iat); + delmem_complex_op()(this->lambda_coeff); } if(this->has_dftu) { if(!init_delta_spin) { - delmem_int_op()(this->ctx, this->ip_iat); + delmem_int_op()(this->ip_iat); } - delmem_int_op()(this->ctx, this->orb_l_iat); - delmem_int_op()(this->ctx, this->ip_m); - delmem_int_op()(this->ctx, this->vu_begin_iat); - delmem_complex_op()(this->ctx, this->vu_device); + delmem_int_op()(this->orb_l_iat); + delmem_int_op()(this->ip_m); + delmem_int_op()(this->vu_begin_iat); + delmem_complex_op()(this->vu_device); } } @@ -127,17 +127,17 @@ void OnsiteProj>::cal_ps_delta_spin(const int npol, const // T *ps = new T[tnp * m]; // ModuleBase::GlobalFunc::ZEROS(ps, m * tnp); if (this->nkb_m < m * tnp) { - resmem_complex_op()(this->ctx, this->ps, tnp * m, "OnsiteProj::ps"); + resmem_complex_op()(this->ps, tnp * m, "OnsiteProj::ps"); this->nkb_m = m * tnp; } - setmem_complex_op()(this->ctx, this->ps, 0, tnp * m); + setmem_complex_op()(this->ps, 0, tnp * m); if(!this->init_delta_spin) { this->init_delta_spin = true; //prepare ip_iat and lambda_coeff - resmem_int_op()(this->ctx, this->ip_iat, onsite_p->get_tot_nproj()); - resmem_complex_op()(this->ctx, this->lambda_coeff, this->ucell->nat * 4); + resmem_int_op()(this->ip_iat, onsite_p->get_tot_nproj()); + resmem_complex_op()(this->lambda_coeff, this->ucell->nat * 4); std::vector ip_iat0(onsite_p->get_tot_nproj()); int ip0 = 0; for(int iat=0;iatucell->nat;iat++) @@ -147,7 +147,7 @@ void OnsiteProj>::cal_ps_delta_spin(const int npol, const ip_iat0[ip0++] = iat; } } - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->ip_iat, ip_iat0.data(), onsite_p->get_tot_nproj()); + syncmem_int_h2d_op()(this->ip_iat, ip_iat0.data(), onsite_p->get_tot_nproj()); } // prepare array of nh_iat and lambda_array to pass to the onsite_ps_op operator @@ -159,7 +159,7 @@ void OnsiteProj>::cal_ps_delta_spin(const int npol, const tmp_lambda_coeff[iat * 4 + 2] = std::complex(lambda[iat][0], -1 * lambda[iat][1]); tmp_lambda_coeff[iat * 4 + 3] = std::complex(-1 * lambda[iat][2], 0.0); } - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, this->lambda_coeff, tmp_lambda_coeff.data(), this->ucell->nat * 4); + syncmem_complex_h2d_op()(this->lambda_coeff, tmp_lambda_coeff.data(), this->ucell->nat * 4); // TODO: code block above should be moved to the init function hamilt::onsite_ps_op()( @@ -225,23 +225,23 @@ void OnsiteProj>::cal_ps_dftu(const int npol, const int m) // T *ps = new T[tnp * m]; // ModuleBase::GlobalFunc::ZEROS(ps, m * tnp); if (this->nkb_m < m * tnp) { - resmem_complex_op()(this->ctx, this->ps, tnp * m, "OnsiteProj::ps"); + resmem_complex_op()(this->ps, tnp * m, "OnsiteProj::ps"); this->nkb_m = m * tnp; } if(!this->has_delta_spin) { - setmem_complex_op()(this->ctx, this->ps, 0, tnp * m); + setmem_complex_op()(this->ps, 0, tnp * m); } if(!this->init_dftu) { this->init_dftu = true; //prepare orb_l_iat, ip_m, vu_begin_iat and vu_device - resmem_int_op()(this->ctx, this->orb_l_iat, this->ucell->nat); - resmem_int_op()(this->ctx, this->ip_m, onsite_p->get_tot_nproj()); - resmem_int_op()(this->ctx, this->vu_begin_iat, this->ucell->nat); + resmem_int_op()(this->orb_l_iat, this->ucell->nat); + resmem_int_op()(this->ip_m, onsite_p->get_tot_nproj()); + resmem_int_op()(this->vu_begin_iat, this->ucell->nat); // recal the ip_iat - resmem_int_op()(this->ctx, this->ip_iat, onsite_p->get_tot_nproj()); + resmem_int_op()(this->ip_iat, onsite_p->get_tot_nproj()); std::vector ip_iat0(onsite_p->get_tot_nproj()); std::vector ip_m0(onsite_p->get_tot_nproj()); std::vector vu_begin_iat0(this->ucell->nat); @@ -285,15 +285,15 @@ void OnsiteProj>::cal_ps_dftu(const int npol, const int m) } } } - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->orb_l_iat, orb_l_iat0.data(), this->ucell->nat); - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->ip_iat, ip_iat0.data(), onsite_p->get_tot_nproj()); - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->ip_m, ip_m0.data(), onsite_p->get_tot_nproj()); - syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->vu_begin_iat, vu_begin_iat0.data(), this->ucell->nat); + syncmem_int_h2d_op()(this->orb_l_iat, orb_l_iat0.data(), this->ucell->nat); + syncmem_int_h2d_op()(this->ip_iat, ip_iat0.data(), onsite_p->get_tot_nproj()); + syncmem_int_h2d_op()(this->ip_m, ip_m0.data(), onsite_p->get_tot_nproj()); + syncmem_int_h2d_op()(this->vu_begin_iat, vu_begin_iat0.data(), this->ucell->nat); - resmem_complex_op()(this->ctx, this->vu_device, dftu->get_size_eff_pot_pw()); + resmem_complex_op()(this->vu_device, dftu->get_size_eff_pot_pw()); } - syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, this->vu_device, dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw()); + syncmem_complex_h2d_op()(this->vu_device, dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw()); hamilt::onsite_ps_op()( this->ctx, // device context diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/veff_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/veff_pw.cpp index 2343ee7ecb..6bff6b2dc0 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/veff_pw.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/veff_pw.cpp @@ -23,16 +23,16 @@ Veff>::Veff(const int* isk_in, this->veff_row = veff_row; this->veff_col = veff_col; this->wfcpw = wfcpw_in; - resmem_complex_op()(this->ctx, this->porter, this->wfcpw->nmaxgr, "Veff::porter"); - resmem_complex_op()(this->ctx, this->porter1, this->wfcpw->nmaxgr, "Veff::porter1"); + resmem_complex_op()(this->porter, this->wfcpw->nmaxgr, "Veff::porter"); + resmem_complex_op()(this->porter1, this->wfcpw->nmaxgr, "Veff::porter1"); } template Veff>::~Veff() { - delmem_complex_op()(this->ctx, this->porter); - delmem_complex_op()(this->ctx, this->porter1); + delmem_complex_op()(this->porter); + delmem_complex_op()(this->porter1); } template @@ -48,7 +48,7 @@ void Veff>::act( ModuleBase::timer::tick("Operator", "VeffPW"); if(is_first_node) { - setmem_complex_op()(this->ctx, tmhpsi, 0, nbasis*nbands/npol); + setmem_complex_op()(tmhpsi, 0, nbasis*nbands/npol); } int max_npw = nbasis / npol; @@ -124,8 +124,8 @@ hamilt::Veff>::Veff(const Veff this->veff_col = veff->get_veff_col(); this->veff_row = veff->get_veff_row(); this->wfcpw = veff->get_wfcpw(); - resmem_complex_op()(this->ctx, this->porter, this->wfcpw->nmaxgr); - resmem_complex_op()(this->ctx, this->porter1, this->wfcpw->nmaxgr); + resmem_complex_op()(this->porter, this->wfcpw->nmaxgr); + resmem_complex_op()(this->porter1, this->wfcpw->nmaxgr); this->veff = veff->get_veff(); if (this->isk == nullptr || this->veff == nullptr || this->wfcpw == nullptr) { ModuleBase::WARNING_QUIT("VeffPW", "Constuctor of Operator::VeffPW is failed, please check your code!"); diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_cc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_cc.cpp index ab8d9b3fa1..bbdefb737a 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_cc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_cc.cpp @@ -289,35 +289,35 @@ void Stress_Func::deriv_drhoc double *aux_d = nullptr; double *drhocg_d = nullptr; if(this->device == base_device::GpuDevice ) { - resmem_var_op()(this->ctx, r_d, mesh); - resmem_var_op()(this->ctx, rhoc_d, mesh); - resmem_var_op()(this->ctx, rab_d, mesh); - - resmem_var_op()(this->ctx, aux_d, mesh); - resmem_var_op()(this->ctx, gx_arr_d, rho_basis->ngg); - resmem_var_op()(this->ctx, drhocg_d, rho_basis->ngg); - - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gx_arr_d, gx_arr.data(), rho_basis->ngg); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, r_d, r, mesh); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rab_d, rab, mesh); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rhoc_d, rhoc, mesh); + resmem_var_op()(r_d, mesh); + resmem_var_op()(rhoc_d, mesh); + resmem_var_op()(rab_d, mesh); + + resmem_var_op()(aux_d, mesh); + resmem_var_op()(gx_arr_d, rho_basis->ngg); + resmem_var_op()(drhocg_d, rho_basis->ngg); + + syncmem_var_h2d_op()(gx_arr_d, gx_arr.data(), rho_basis->ngg); + syncmem_var_h2d_op()(r_d, r, mesh); + syncmem_var_h2d_op()(rab_d, rab, mesh); + syncmem_var_h2d_op()(rhoc_d, rhoc, mesh); } if(this->device == base_device::GpuDevice) { hamilt::cal_stress_drhoc_aux_op()( r_d,rhoc_d,gx_arr_d+igl0,rab_d,drhocg_d+igl0,mesh,igl0,rho_basis->ngg-igl0,omega,type); - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, drhocg+igl0, drhocg_d+igl0, rho_basis->ngg-igl0); + syncmem_var_d2h_op()(drhocg+igl0, drhocg_d+igl0, rho_basis->ngg-igl0); } else { hamilt::cal_stress_drhoc_aux_op()( r,rhoc,gx_arr.data()+igl0,rab,drhocg+igl0,mesh,igl0,rho_basis->ngg-igl0,omega,type); } - delmem_var_op()(this->ctx, r_d); - delmem_var_op()(this->ctx, rhoc_d); - delmem_var_op()(this->ctx, rab_d); - delmem_var_op()(this->ctx, gx_arr_d); - delmem_var_op()(this->ctx, drhocg_d); + delmem_var_op()(r_d); + delmem_var_op()(rhoc_d); + delmem_var_op()(rab_d); + delmem_var_op()(gx_arr_d); + delmem_var_op()(drhocg_d); return; } diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp index 740d692c39..42e619c9bc 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_loc.cpp @@ -244,22 +244,20 @@ const UnitCell& ucell_in double *aux_d = nullptr; double *drhocg_d = nullptr; if (this->device == base_device::GpuDevice) { - resmem_var_op()(this->ctx, r_d, msh); - resmem_var_op()(this->ctx, rhoc_d, msh); - resmem_var_op()(this->ctx, rab_d, msh); + resmem_var_op()(r_d, msh); + resmem_var_op()(rhoc_d, msh); + resmem_var_op()(rab_d, msh); - resmem_var_op()(this->ctx, aux_d, msh); - resmem_var_op()(this->ctx, gx_arr_d, rho_basis->ngg+1); - resmem_var_op()(this->ctx, drhocg_d, rho_basis->ngg); + resmem_var_op()(aux_d, msh); + resmem_var_op()(gx_arr_d, rho_basis->ngg+1); + resmem_var_op()(drhocg_d, rho_basis->ngg); - syncmem_var_h2d_op()(this->ctx, - this->cpu_ctx, - gx_arr_d, + syncmem_var_h2d_op()(gx_arr_d, gx_arr.data(), rho_basis->ngg+1); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, r_d, r, msh); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rab_d, rab, msh); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, rhoc_d, aux.data(), msh); + syncmem_var_h2d_op()(r_d, r, msh); + syncmem_var_h2d_op()(rab_d, rab, msh); + syncmem_var_h2d_op()(rhoc_d, aux.data(), msh); } @@ -267,7 +265,7 @@ const UnitCell& ucell_in if(this->device == base_device::GpuDevice) { hamilt::cal_stress_drhoc_aux_op()( r_d,rhoc_d,gx_arr_d+igl0,rab_d,drhocg_d+igl0,msh,igl0,rho_basis->ngg-igl0,ucell_in.omega,3); - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, dvloc+igl0, drhocg_d+igl0, rho_basis->ngg-igl0); + syncmem_var_d2h_op()(dvloc+igl0, drhocg_d+igl0, rho_basis->ngg-igl0); } else { hamilt::cal_stress_drhoc_aux_op()( diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp index 73b9e08a82..1af82ba153 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_nl.cpp @@ -30,8 +30,8 @@ void Stress_Func::stress_nl(ModuleBase::matrix& sigma, ModuleBase::timer::tick("Stress_Func", "stress_nl"); FPTYPE* stress_device = nullptr; - resmem_var_op()(this->ctx, stress_device, 9); - setmem_var_op()(this->ctx, stress_device, 0, 9); + resmem_var_op()(stress_device, 9); + setmem_var_op()(stress_device, 0, 9); std::vector sigmanlc(9, 0.0); hamilt::FS_Nonlocal_tools nl_tools(&nlpp_in, &ucell_in, p_kv, wfc_basis, p_sf, wg, &ekb); @@ -69,8 +69,8 @@ void Stress_Func::stress_nl(ModuleBase::matrix& sigma, } } // transfer stress from device to host - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, sigmanlc.data(), stress_device, 9); - delmem_var_op()(this->ctx, stress_device); + syncmem_var_d2h_op()(sigmanlc.data(), stress_device, 9); + delmem_var_op()(stress_device); // sum up forcenl from all processors for (int l = 0; l < 3; l++) { diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp index 8568821a10..acce052e83 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp @@ -22,8 +22,8 @@ void Stress_Func::stress_onsite(ModuleBase::matrix& sigma, ModuleBase::timer::tick("Stress_Func", "stress_onsite"); FPTYPE* stress_device = nullptr; - resmem_var_op()(this->ctx, stress_device, 9); - setmem_var_op()(this->ctx, stress_device, 0, 9); + resmem_var_op()(stress_device, 9); + setmem_var_op()(stress_device, 0, 9); std::vector sigma_onsite(9, 0.0); auto* onsite_p = projectors::OnsiteProjector::get_instance(); @@ -68,8 +68,8 @@ void Stress_Func::stress_onsite(ModuleBase::matrix& sigma, } } // transfer stress from device to host - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, sigma_onsite.data(), stress_device, 9); - delmem_var_op()(this->ctx, stress_device); + syncmem_var_d2h_op()(sigma_onsite.data(), stress_device, 9); + delmem_var_op()(stress_device); // sum up forcenl from all processors for (int l = 0; l < 3; l++) { diff --git a/source/module_hamilt_pw/hamilt_pwdft/structure_factor.cpp b/source/module_hamilt_pw/hamilt_pwdft/structure_factor.cpp index 24dcbe27ce..4e328c1fda 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/structure_factor.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/structure_factor.cpp @@ -27,19 +27,19 @@ Structure_Factor::~Structure_Factor() { if (device == "gpu") { if (PARAM.inp.precision == "single") { - delmem_cd_op()(gpu_ctx, this->c_eigts1); - delmem_cd_op()(gpu_ctx, this->c_eigts2); - delmem_cd_op()(gpu_ctx, this->c_eigts3); + delmem_cd_op()(this->c_eigts1); + delmem_cd_op()(this->c_eigts2); + delmem_cd_op()(this->c_eigts3); } - delmem_zd_op()(gpu_ctx, this->z_eigts1); - delmem_zd_op()(gpu_ctx, this->z_eigts2); - delmem_zd_op()(gpu_ctx, this->z_eigts3); + delmem_zd_op()(this->z_eigts1); + delmem_zd_op()(this->z_eigts2); + delmem_zd_op()(this->z_eigts3); } else { if (PARAM.inp.precision == "single") { - delmem_ch_op()(cpu_ctx, this->c_eigts1); - delmem_ch_op()(cpu_ctx, this->c_eigts2); - delmem_ch_op()(cpu_ctx, this->c_eigts3); + delmem_ch_op()(this->c_eigts1); + delmem_ch_op()(this->c_eigts2); + delmem_ch_op()(this->c_eigts3); } // There's no need to delete double precision pointers while in a CPU environment. } @@ -151,28 +151,28 @@ void Structure_Factor::setup_structure_factor(const UnitCell* Ucell, const Paral } if (device == "gpu") { if (PARAM.inp.precision == "single") { - resmem_cd_op()(gpu_ctx, this->c_eigts1, Ucell->nat * (2 * rho_basis->nx + 1)); - resmem_cd_op()(gpu_ctx, this->c_eigts2, Ucell->nat * (2 * rho_basis->ny + 1)); - resmem_cd_op()(gpu_ctx, this->c_eigts3, Ucell->nat * (2 * rho_basis->nz + 1)); - castmem_z2c_h2d_op()(gpu_ctx, cpu_ctx, this->c_eigts1, this->eigts1.c, Ucell->nat * (2 * rho_basis->nx + 1)); - castmem_z2c_h2d_op()(gpu_ctx, cpu_ctx, this->c_eigts2, this->eigts2.c, Ucell->nat * (2 * rho_basis->ny + 1)); - castmem_z2c_h2d_op()(gpu_ctx, cpu_ctx, this->c_eigts3, this->eigts3.c, Ucell->nat * (2 * rho_basis->nz + 1)); + resmem_cd_op()(this->c_eigts1, Ucell->nat * (2 * rho_basis->nx + 1)); + resmem_cd_op()(this->c_eigts2, Ucell->nat * (2 * rho_basis->ny + 1)); + resmem_cd_op()(this->c_eigts3, Ucell->nat * (2 * rho_basis->nz + 1)); + castmem_z2c_h2d_op()(this->c_eigts1, this->eigts1.c, Ucell->nat * (2 * rho_basis->nx + 1)); + castmem_z2c_h2d_op()(this->c_eigts2, this->eigts2.c, Ucell->nat * (2 * rho_basis->ny + 1)); + castmem_z2c_h2d_op()(this->c_eigts3, this->eigts3.c, Ucell->nat * (2 * rho_basis->nz + 1)); } - resmem_zd_op()(gpu_ctx, this->z_eigts1, Ucell->nat * (2 * rho_basis->nx + 1)); - resmem_zd_op()(gpu_ctx, this->z_eigts2, Ucell->nat * (2 * rho_basis->ny + 1)); - resmem_zd_op()(gpu_ctx, this->z_eigts3, Ucell->nat * (2 * rho_basis->nz + 1)); - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, this->z_eigts1, this->eigts1.c, Ucell->nat * (2 * rho_basis->nx + 1)); - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, this->z_eigts2, this->eigts2.c, Ucell->nat * (2 * rho_basis->ny + 1)); - syncmem_z2z_h2d_op()(gpu_ctx, cpu_ctx, this->z_eigts3, this->eigts3.c, Ucell->nat * (2 * rho_basis->nz + 1)); + resmem_zd_op()(this->z_eigts1, Ucell->nat * (2 * rho_basis->nx + 1)); + resmem_zd_op()(this->z_eigts2, Ucell->nat * (2 * rho_basis->ny + 1)); + resmem_zd_op()(this->z_eigts3, Ucell->nat * (2 * rho_basis->nz + 1)); + syncmem_z2z_h2d_op()(this->z_eigts1, this->eigts1.c, Ucell->nat * (2 * rho_basis->nx + 1)); + syncmem_z2z_h2d_op()(this->z_eigts2, this->eigts2.c, Ucell->nat * (2 * rho_basis->ny + 1)); + syncmem_z2z_h2d_op()(this->z_eigts3, this->eigts3.c, Ucell->nat * (2 * rho_basis->nz + 1)); } else { if (PARAM.inp.precision == "single") { - resmem_ch_op()(cpu_ctx, this->c_eigts1, Ucell->nat * (2 * rho_basis->nx + 1)); - resmem_ch_op()(cpu_ctx, this->c_eigts2, Ucell->nat * (2 * rho_basis->ny + 1)); - resmem_ch_op()(cpu_ctx, this->c_eigts3, Ucell->nat * (2 * rho_basis->nz + 1)); - castmem_z2c_h2h_op()(cpu_ctx, cpu_ctx, this->c_eigts1, this->eigts1.c, Ucell->nat * (2 * rho_basis->nx + 1)); - castmem_z2c_h2h_op()(cpu_ctx, cpu_ctx, this->c_eigts2, this->eigts2.c, Ucell->nat * (2 * rho_basis->ny + 1)); - castmem_z2c_h2h_op()(cpu_ctx, cpu_ctx, this->c_eigts3, this->eigts3.c, Ucell->nat * (2 * rho_basis->nz + 1)); + resmem_ch_op()(this->c_eigts1, Ucell->nat * (2 * rho_basis->nx + 1)); + resmem_ch_op()(this->c_eigts2, Ucell->nat * (2 * rho_basis->ny + 1)); + resmem_ch_op()(this->c_eigts3, Ucell->nat * (2 * rho_basis->nz + 1)); + castmem_z2c_h2h_op()(this->c_eigts1, this->eigts1.c, Ucell->nat * (2 * rho_basis->nx + 1)); + castmem_z2c_h2h_op()(this->c_eigts2, this->eigts2.c, Ucell->nat * (2 * rho_basis->ny + 1)); + castmem_z2c_h2h_op()(this->c_eigts3, this->eigts3.c, Ucell->nat * (2 * rho_basis->nz + 1)); } this->z_eigts1 = this->eigts1.c; this->z_eigts2 = this->eigts2.c; diff --git a/source/module_hamilt_pw/hamilt_pwdft/structure_factor_k.cpp b/source/module_hamilt_pw/hamilt_pwdft/structure_factor_k.cpp index add76f6fb3..bca92ac1cf 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/structure_factor_k.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/structure_factor_k.cpp @@ -91,11 +91,11 @@ void Structure_Factor::get_sk(Device* ctx, } if (device == base_device::GpuDevice) { - resmem_int_op()(ctx, atom_na, ucell->ntype); - syncmem_int_op()(ctx, cpu_ctx, atom_na, h_atom_na, ucell->ntype); + resmem_int_op()(atom_na, ucell->ntype); + syncmem_int_op()(atom_na, h_atom_na, ucell->ntype); - resmem_var_op()(ctx, atom_tau, ucell->nat * 3); - syncmem_var_op()(ctx, cpu_ctx, atom_tau, h_atom_tau, ucell->nat * 3); + resmem_var_op()(atom_tau, ucell->nat * 3); + syncmem_var_op()(atom_tau, h_atom_tau, ucell->nat * 3); igl2isz = wfc_basis->d_igl2isz_k; is2fftixy = wfc_basis->d_is2fftixy; @@ -135,8 +135,8 @@ void Structure_Factor::get_sk(Device* ctx, sk); if (device == base_device::GpuDevice) { - delmem_int_op()(ctx, atom_na); - delmem_var_op()(ctx, atom_tau); + delmem_int_op()(atom_na); + delmem_var_op()(atom_tau); } delete[] h_atom_na; delete[] h_atom_tau; diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_che.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_che.cpp index 23a5a18926..34e20977eb 100644 --- a/source/module_hamilt_pw/hamilt_stodft/sto_che.cpp +++ b/source/module_hamilt_pw/hamilt_stodft/sto_che.cpp @@ -9,7 +9,7 @@ StoChe::~StoChe() { delete p_che; delete[] spolyv_cpu; - delmem_var_op()(this->ctx, spolyv); + delmem_var_op()(spolyv); } template @@ -20,12 +20,12 @@ StoChe::StoChe(const int& nche, const int& method, const REAL& ema p_che = new ModuleBase::Chebyshev(nche); if (method == 1) { - resmem_var_op()(this->ctx, spolyv, nche); + resmem_var_op()(spolyv, nche); spolyv_cpu = new REAL[nche]; } else { - resmem_var_op()(this->ctx, spolyv, nche * nche); + resmem_var_op()(spolyv, nche * nche); } this->emax_sto = emax_sto; diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_che.h b/source/module_hamilt_pw/hamilt_stodft/sto_che.h index 3a7d2f0090..f241553b66 100644 --- a/source/module_hamilt_pw/hamilt_stodft/sto_che.h +++ b/source/module_hamilt_pw/hamilt_stodft/sto_che.h @@ -50,19 +50,17 @@ REAL vTMv(const REAL* v, const REAL* M, const int n) const int inc = 1; const REAL zero = 0; REAL* y = nullptr; - base_device::memory::resize_memory_op()(ctx, y, n); + base_device::memory::resize_memory_op()(y, n); hsolver::gemv_op()(ctx, normal, n, n, &one, M, n, v, inc, &zero, y, inc); REAL result = 0; REAL* dot_device = nullptr; - base_device::memory::resize_memory_op()(ctx, dot_device, 1); + base_device::memory::resize_memory_op()(dot_device, 1); container::kernels::blas_dot()(n, y, 1, v, 1, dot_device); - base_device::memory::synchronize_memory_op()(cpu_ctx, - ctx, - &result, + base_device::memory::synchronize_memory_op()(&result, dot_device, 1); - base_device::memory::delete_memory_op()(ctx, y); - base_device::memory::delete_memory_op()(ctx, dot_device); + base_device::memory::delete_memory_op()(y); + base_device::memory::delete_memory_op()(dot_device); return result; } diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp index db54e40db0..6684332781 100644 --- a/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp +++ b/source/module_hamilt_pw/hamilt_stodft/sto_forces.cpp @@ -217,8 +217,8 @@ void Sto_Forces::cal_sto_force_nl( // allocate memory for the force FPTYPE* force = nullptr; - resmem_var_op()(this->ctx, force, ucell.nat * 3); - base_device::memory::set_memory_op()(this->ctx, force, 0.0, ucell.nat * 3); + resmem_var_op()(force, ucell.nat * 3); + base_device::memory::set_memory_op()(force, 0.0, ucell.nat * 3); hamilt::FS_Nonlocal_tools nl_tools(&nlpp, &ucell, p_kv, wfc_basis, p_sf, wg, nullptr); @@ -250,8 +250,8 @@ void Sto_Forces::cal_sto_force_nl( nl_tools.cal_force(ik, max_nbands, nstobands, false, force, nksbands); } // end ik - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, forcenl.c, force, forcenl.nr * forcenl.nc); - delmem_var_op()(this->ctx, force); + syncmem_var_d2h_op()(forcenl.c, force, forcenl.nr * forcenl.nc); + delmem_var_op()(force); // sum up forcenl from all processors Parallel_Reduce::reduce_all(forcenl.c, forcenl.nr * forcenl.nc); diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp index ec4aa26c1c..8ec669febd 100644 --- a/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp +++ b/source/module_hamilt_pw/hamilt_stodft/sto_iter.cpp @@ -27,10 +27,10 @@ template void Stochastic_Iter::dot(const int& n, const Real* x, const int& incx, const Real* y, const int& incy, Real& result) { Real* result_device = nullptr; - resmem_var_op()(this->ctx, result_device, 1); + resmem_var_op()(result_device, 1); container::kernels::blas_dot()(n, p_che->coef_real, 1, spolyv, 1, result_device); - syncmem_var_d2h_op()(cpu_ctx, this->ctx, &result, result_device, 1); - delmem_var_op()(this->ctx, result_device); + syncmem_var_d2h_op()(&result, result_device, 1); + delmem_var_op()(result_device); } template @@ -65,7 +65,7 @@ void Stochastic_Iter::orthog(const int& ik, psi::Psi& psi, stowf.chi0->fix_k(ik); stowf.chiortho->fix_k(ik); T *wfgin = stowf.chi0->get_pointer(), *wfgout = stowf.chiortho->get_pointer(); - cpymem_complex_op()(this->ctx, this->ctx, wfgout, wfgin, npwx * nchipk); + cpymem_complex_op()(wfgout, wfgin, npwx * nchipk); // for (int ig = 0; ig < npwx * nchipk; ++ig) // { // wfgout[ig] = wfgin[ig]; @@ -73,7 +73,7 @@ void Stochastic_Iter::orthog(const int& ik, psi::Psi& psi, // orthogonal part T* sum = nullptr; - resmem_complex_op()(this->ctx, sum, PARAM.inp.nbands * nchipk); + resmem_complex_op()(sum, PARAM.inp.nbands * nchipk); char transC = 'C'; char transN = 'N'; @@ -109,7 +109,7 @@ void Stochastic_Iter::orthog(const int& ik, psi::Psi& psi, &ModuleBase::ONE, wfgout, npwx); - delmem_complex_op()(this->ctx, sum); + delmem_complex_op()(sum); } ModuleBase::timer::tick("Stochastic_Iter", "orthog"); } @@ -209,8 +209,8 @@ void Stochastic_Iter::check_precision(const double ref, const double { Real last_coef = 0; Real last_spolyv = 0; - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, &last_coef, &p_che->coef_real[p_che->norder - 1], 1); - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, &last_spolyv, &spolyv[p_che->norder - 1], 1); + syncmem_var_d2h_op()(&last_coef, &p_che->coef_real[p_che->norder - 1], 1); + syncmem_var_d2h_op()(&last_spolyv, &spolyv[p_che->norder - 1], 1); error = last_coef * last_spolyv; } else @@ -220,8 +220,8 @@ void Stochastic_Iter::check_precision(const double ref, const double // double last_spolyv = spolyv[norder * norder - 1]; Real last_coef = 0; Real last_spolyv = 0; - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, &last_coef, &p_che->coef_real[norder - 1], 1); - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, &last_spolyv, &spolyv[norder * norder - 1], 1); + syncmem_var_d2h_op()(&last_coef, &p_che->coef_real[norder - 1], 1); + syncmem_var_d2h_op()(&last_spolyv, &spolyv[norder * norder - 1], 1); Real dot1 = 0, dot2 = 0; this->dot(norder, p_che->coef_real, 1, spolyv + norder * (norder - 1), 1, dot1); this->dot(norder, p_che->coef_real, 1, spolyv + norder - 1, norder, dot2); @@ -362,7 +362,7 @@ void Stochastic_Iter::calPn(const int& ik, Stochastic_WF& } else { - setmem_var_op()(this->ctx, spolyv, 0, norder * norder); + setmem_var_op()(spolyv, 0, norder * norder); } } T* pchi; @@ -391,7 +391,7 @@ void Stochastic_Iter::calPn(const int& ik, Stochastic_WF& } if(ik == this->pkv->get_nks() - 1) { - syncmem_var_h2d_op()(this->ctx, cpu_ctx, spolyv, spolyv_cpu, norder); + syncmem_var_h2d_op()(spolyv, spolyv_cpu, norder); } } else @@ -539,7 +539,7 @@ void Stochastic_Iter::sum_stoeband(Stochastic_WF& stowf, const int npw = this->pkv->ngk[ik]; const double kweight = this->pkv->wk[ik]; T* hshchi = nullptr; - resmem_complex_op()(this->ctx, hshchi, nchip_ik * npwx); + resmem_complex_op()(hshchi, nchip_ik * npwx); T* tmpin = stowf.shchi->get_pointer(); T* tmpout = hshchi; p_hamilt_sto->hPsi(tmpin, tmpout, nchip_ik); @@ -549,7 +549,7 @@ void Stochastic_Iter::sum_stoeband(Stochastic_WF& stowf, tmpin += npwx; tmpout += npwx; } - delmem_complex_op()(this->ctx, hshchi); + delmem_complex_op()(hshchi); } } #ifdef __MPI @@ -573,7 +573,7 @@ void Stochastic_Iter::cal_storho(const UnitCell& ucell, const int nspin = PARAM.inp.nspin; T* porter = nullptr; - resmem_complex_op()(this->ctx, porter, nrxx); + resmem_complex_op()(porter, nrxx); std::vector sto_rho(nspin); for(int is = 0; is < nspin; ++is) @@ -597,7 +597,7 @@ void Stochastic_Iter::cal_storho(const UnitCell& ucell, } for (int is = 0; is < nspin; is++) { - setmem_var_op()(this->ctx, pes->rho[is], 0, nrxx); + setmem_var_op()(pes->rho[is], 0, nrxx); } for (int ik = 0; ik < this->pkv->get_nks(); ++ik) { @@ -624,7 +624,7 @@ void Stochastic_Iter::cal_storho(const UnitCell& ucell, if (PARAM.inp.device == "gpu" || PARAM.inp.precision == "single") { for(int is = 0; is < nspin; ++is) { - castmem_var_d2h_op()(this->cpu_ctx, this->ctx, sto_rho[is], pes->rho[is], nrxx); + castmem_var_d2h_op()(sto_rho[is], pes->rho[is], nrxx); } } else @@ -633,7 +633,7 @@ void Stochastic_Iter::cal_storho(const UnitCell& ucell, pes->rho = reinterpret_cast(pes->charge->rho); } - delmem_complex_op()(this->ctx, porter); + delmem_complex_op()(porter); #ifdef __MPI if(GlobalV::KPAR > 1) { @@ -735,11 +735,11 @@ void Stochastic_Iter::calTnchi_ik(const int& ik, Stochastic_WFnorder; T* coef_real = nullptr; - resmem_complex_op()(this->ctx, coef_real, N); - castmem_d2z_op()(this->ctx, this->ctx, coef_real, p_che->coef_real, p_che->norder); + resmem_complex_op()(coef_real, N); + castmem_d2z_op()(coef_real, p_che->coef_real, p_che->norder); gemv_op()(this->ctx, transa, M, N, &one, stowf.chiallorder[ik].get_pointer(), LDA, coef_real, inc, &zero, out, inc); // zgemv_(&transa, &M, &N, &one, stowf.chiallorder[ik].get_pointer(), &LDA, coef_real, &inc, &zero, out, &inc); - delmem_complex_op()(this->ctx, coef_real); + delmem_complex_op()(coef_real); } else { diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp index 5be294f2e7..62a4c16779 100644 --- a/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp +++ b/source/module_hamilt_pw/hamilt_stodft/sto_stress_pw.cpp @@ -167,8 +167,8 @@ void Sto_Stress_PW::sto_stress_nl(ModuleBase::matrix& sigma, // allocate memory for the stress FPTYPE* stress_device = nullptr; - resmem_var_op()(this->ctx, stress_device, 9); - setmem_var_op()(this->ctx, stress_device, 0, 9); + resmem_var_op()(stress_device, 9); + setmem_var_op()(stress_device, 0, 9); std::vector sigmanlc(9, 0.0); hamilt::FS_Nonlocal_tools nl_tools(&nlpp, &ucell, p_kv, wfc_basis, p_sf, wg, nullptr); @@ -201,8 +201,8 @@ void Sto_Stress_PW::sto_stress_nl(ModuleBase::matrix& sigma, } // transfer stress from device to host - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, sigmanlc.data(), stress_device, 9); - delmem_var_op()(this->ctx, stress_device); + syncmem_var_d2h_op()(sigmanlc.data(), stress_device, 9); + delmem_var_op()(stress_device); // sum up forcenl from all processors for (int l = 0; l < 3; l++) { diff --git a/source/module_hamilt_pw/hamilt_stodft/sto_wf.cpp b/source/module_hamilt_pw/hamilt_stodft/sto_wf.cpp index 8a76daa9e9..d3d720106c 100644 --- a/source/module_hamilt_pw/hamilt_stodft/sto_wf.cpp +++ b/source/module_hamilt_pw/hamilt_stodft/sto_wf.cpp @@ -52,7 +52,7 @@ void Stochastic_WF::allocate_chiallorder(const int& norder) for (int ik = 0; ik < this->nks; ++ik) { chiallorder[ik].resize(1, this->nchip[ik] * this->npwx, norder); - setmem_complex_op()(chiallorder[ik].get_device(), chiallorder[ik].get_pointer(), 0, chiallorder[ik].size()); + setmem_complex_op()(chiallorder[ik].get_pointer(), 0, chiallorder[ik].size()); } } @@ -374,9 +374,7 @@ void Stochastic_WF::sync_chi0() Device* ctx = {}; if (base_device::get_device_type(ctx) == base_device::GpuDevice) { - syncmem_h2d_op()(this->chi0->get_device(), - this->chi0_cpu->get_device(), - this->chi0->get_pointer(), + syncmem_h2d_op()(this->chi0->get_pointer(), this->chi0_cpu->get_pointer(), this->chi0_cpu->size()); } diff --git a/source/module_hsolver/diag_const_nums.cpp b/source/module_hsolver/diag_const_nums.cpp index 8b459cbf7c..4d9cb8fd83 100644 --- a/source/module_hsolver/diag_const_nums.cpp +++ b/source/module_hsolver/diag_const_nums.cpp @@ -11,14 +11,11 @@ template class const_nums>; template <> const_nums::const_nums() { - base_device::memory::resize_memory_op()( - this->cpu_ctx, this->zero, 1); + base_device::memory::resize_memory_op()(this->zero, 1); this->zero[0] = 0.0; - base_device::memory::resize_memory_op()( - this->cpu_ctx, this->one, 1); + base_device::memory::resize_memory_op()(this->one, 1); this->one[0] = 1.0; - base_device::memory::resize_memory_op()( - this->cpu_ctx, this->neg_one, 1); + base_device::memory::resize_memory_op()(this->neg_one, 1); this->neg_one[0] = -1.0; } @@ -26,14 +23,11 @@ const_nums::const_nums() template <> const_nums::const_nums() { - base_device::memory::resize_memory_op()( - this->cpu_ctx, this->zero, 1); + base_device::memory::resize_memory_op()(this->zero, 1); this->zero[0] = 0.0; - base_device::memory::resize_memory_op()( - this->cpu_ctx, this->one, 1); + base_device::memory::resize_memory_op()(this->one, 1); this->one[0] = 1.0; - base_device::memory::resize_memory_op()( - this->cpu_ctx, this->neg_one, 1); + base_device::memory::resize_memory_op()(this->neg_one, 1); this->neg_one[0] = -1.0; } @@ -41,14 +35,11 @@ const_nums::const_nums() template <> const_nums>::const_nums() { - base_device::memory::resize_memory_op, base_device::DEVICE_CPU>()( - this->cpu_ctx, this->zero, 1); + base_device::memory::resize_memory_op, base_device::DEVICE_CPU>()(this->zero, 1); this->zero[0] = std::complex(0.0, 0.0); - base_device::memory::resize_memory_op, base_device::DEVICE_CPU>()( - this->cpu_ctx, this->one, 1); + base_device::memory::resize_memory_op, base_device::DEVICE_CPU>()(this->one, 1); this->one[0] = std::complex(1.0, 0.0); - base_device::memory::resize_memory_op, base_device::DEVICE_CPU>()( - this->cpu_ctx, this->neg_one, 1); + base_device::memory::resize_memory_op, base_device::DEVICE_CPU>()(this->neg_one, 1); this->neg_one[0] = std::complex(-1.0, 0.0); } @@ -56,13 +47,10 @@ const_nums>::const_nums() template <> const_nums>::const_nums() { - base_device::memory::resize_memory_op, base_device::DEVICE_CPU>()( - this->cpu_ctx, this->zero, 1); + base_device::memory::resize_memory_op, base_device::DEVICE_CPU>()(this->zero, 1); this->zero[0] = std::complex(0.0, 0.0); - base_device::memory::resize_memory_op, base_device::DEVICE_CPU>()( - this->cpu_ctx, this->one, 1); + base_device::memory::resize_memory_op, base_device::DEVICE_CPU>()(this->one, 1); this->one[0] = std::complex(1.0, 0.0); - base_device::memory::resize_memory_op, base_device::DEVICE_CPU>()( - this->cpu_ctx, this->neg_one, 1); + base_device::memory::resize_memory_op, base_device::DEVICE_CPU>()(this->neg_one, 1); this->neg_one[0] = std::complex(-1.0, 0.0); } \ No newline at end of file diff --git a/source/module_hsolver/diago_dav_subspace.cpp b/source/module_hsolver/diago_dav_subspace.cpp index 82dadcb0d0..f7daf229a2 100644 --- a/source/module_hsolver/diago_dav_subspace.cpp +++ b/source/module_hsolver/diago_dav_subspace.cpp @@ -46,30 +46,30 @@ Diago_DavSubspace::Diago_DavSubspace(const std::vector& precond // TODO: Added memory usage statistics //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - resmem_complex_op()(this->ctx, this->psi_in_iter, this->nbase_x * this->dim, "DAV::psi_in_iter"); - setmem_complex_op()(this->ctx, this->psi_in_iter, 0, this->nbase_x * this->dim); + resmem_complex_op()(this->psi_in_iter, this->nbase_x * this->dim, "DAV::psi_in_iter"); + setmem_complex_op()(this->psi_in_iter, 0, this->nbase_x * this->dim); // the product of H and psi in the reduced psi set - resmem_complex_op()(this->ctx, this->hphi, this->nbase_x * this->dim, "DAV::hphi"); - setmem_complex_op()(this->ctx, this->hphi, 0, this->nbase_x * this->dim); + resmem_complex_op()(this->hphi, this->nbase_x * this->dim, "DAV::hphi"); + setmem_complex_op()(this->hphi, 0, this->nbase_x * this->dim); // Hamiltonian on the reduced psi set - resmem_complex_op()(this->ctx, this->hcc, this->nbase_x * this->nbase_x, "DAV::hcc"); - setmem_complex_op()(this->ctx, this->hcc, 0, this->nbase_x * this->nbase_x); + resmem_complex_op()(this->hcc, this->nbase_x * this->nbase_x, "DAV::hcc"); + setmem_complex_op()(this->hcc, 0, this->nbase_x * this->nbase_x); // Overlap on the reduced psi set - resmem_complex_op()(this->ctx, this->scc, this->nbase_x * this->nbase_x, "DAV::scc"); - setmem_complex_op()(this->ctx, this->scc, 0, this->nbase_x * this->nbase_x); + resmem_complex_op()(this->scc, this->nbase_x * this->nbase_x, "DAV::scc"); + setmem_complex_op()(this->scc, 0, this->nbase_x * this->nbase_x); // Eigenvectors - resmem_complex_op()(this->ctx, this->vcc, this->nbase_x * this->nbase_x, "DAV::vcc"); - setmem_complex_op()(this->ctx, this->vcc, 0, this->nbase_x * this->nbase_x); + resmem_complex_op()(this->vcc, this->nbase_x * this->nbase_x, "DAV::vcc"); + setmem_complex_op()(this->vcc, 0, this->nbase_x * this->nbase_x); //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< #if defined(__CUDA) || defined(__ROCM) if (this->device == base_device::GpuDevice) { - resmem_real_op()(this->ctx, this->d_precondition, nbasis_in); + resmem_real_op()(this->d_precondition, nbasis_in); // syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition.data(), nbasis_in); } #endif @@ -78,17 +78,17 @@ Diago_DavSubspace::Diago_DavSubspace(const std::vector& precond template Diago_DavSubspace::~Diago_DavSubspace() { - delmem_complex_op()(this->ctx, this->psi_in_iter); + delmem_complex_op()(this->psi_in_iter); - delmem_complex_op()(this->ctx, this->hphi); - delmem_complex_op()(this->ctx, this->hcc); - delmem_complex_op()(this->ctx, this->scc); - delmem_complex_op()(this->ctx, this->vcc); + delmem_complex_op()(this->hphi); + delmem_complex_op()(this->hcc); + delmem_complex_op()(this->scc); + delmem_complex_op()(this->vcc); #if defined(__CUDA) || defined(__ROCM) if (this->device == base_device::GpuDevice) { - delmem_real_op()(this->ctx, this->d_precondition); + delmem_real_op()(this->d_precondition); } #endif } @@ -123,9 +123,7 @@ int Diago_DavSubspace::diag_once(const HPsiFunc& hpsi_func, { unconv[m] = m; - syncmem_complex_op()(this->ctx, - this->ctx, - this->psi_in_iter + m * this->dim, + syncmem_complex_op()(this->psi_in_iter + m * this->dim, psi_in + m * psi_in_dmax, this->dim); } @@ -190,7 +188,7 @@ int Diago_DavSubspace::diag_once(const HPsiFunc& hpsi_func, ModuleBase::timer::tick("Diago_DavSubspace", "last"); // updata eigenvectors of Hamiltonian - setmem_complex_op()(this->ctx, psi_in, 0, n_band * psi_in_dmax); + setmem_complex_op()(psi_in, 0, n_band * psi_in_dmax); #ifdef __DSP gemm_op_mt() // In order to not coding another whole template, using this method to minimize the code change. @@ -228,9 +226,7 @@ int Diago_DavSubspace::diag_once(const HPsiFunc& hpsi_func, // update this->psi_in_iter according to psi_in for (size_t i = 0; i < this->n_band; i++) { - syncmem_complex_op()(this->ctx, - this->ctx, - this->psi_in_iter + i * this->dim, + syncmem_complex_op()(this->psi_in_iter + i * this->dim, psi_in + i * psi_in_dmax, this->dim); } @@ -273,7 +269,7 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, { if (unconv[i] != i) { - syncmem_complex_op()(this->ctx, this->ctx, vcc + i * this->nbase_x, vcc + unconv[i] * this->nbase_x, nbase); + syncmem_complex_op()(vcc + i * this->nbase_x, vcc + unconv[i] * this->nbase_x, nbase); (*eigenvalue_iter)[i] = (*eigenvalue_iter)[unconv[i]]; } } @@ -303,14 +299,14 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, if(this->device == base_device::GpuDevice) { e_temp_hd = nullptr; - resmem_real_op()(this->ctx, e_temp_hd, nbase); + resmem_real_op()(e_temp_hd, nbase); } for (int m = 0; m < notconv; m++) { e_temp_cpu.assign(nbase, (-1.0 * (*eigenvalue_iter)[m])); if (this->device == base_device::GpuDevice) { - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_hd, e_temp_cpu.data(), nbase); + syncmem_var_h2d_op()(e_temp_hd, e_temp_cpu.data(), nbase); } vector_mul_vector_op()(this->ctx, nbase, @@ -320,7 +316,7 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, } if(this->device == base_device::GpuDevice) { - delmem_real_op()(this->ctx, e_temp_hd); + delmem_real_op()(e_temp_hd); } #ifdef __DSP @@ -356,7 +352,7 @@ void Diago_DavSubspace::cal_grad(const HPsiFunc& hpsi_func, #if defined(__CUDA) || defined(__ROCM) if (this->device == base_device::GpuDevice) { - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, pre.data(), this->dim); + syncmem_var_h2d_op()(this->d_precondition, pre.data(), this->dim); vector_div_vector_op()(this->ctx, this->dim, psi_iter + (nbase + m) * this->dim, @@ -461,7 +457,7 @@ void Diago_DavSubspace::cal_elem(const int& dim, #else auto* swap = new T[notconv * this->nbase_x]; - syncmem_complex_op()(this->ctx, this->ctx, swap, hcc + nbase * this->nbase_x, notconv * this->nbase_x); + syncmem_complex_op()(swap, hcc + nbase * this->nbase_x, notconv * this->nbase_x); if (std::is_same::value) { @@ -491,7 +487,7 @@ void Diago_DavSubspace::cal_elem(const int& dim, this->diag_comm.comm); } - syncmem_complex_op()(this->ctx, this->ctx, swap, scc + nbase * this->nbase_x, notconv * this->nbase_x); + syncmem_complex_op()(swap, scc + nbase * this->nbase_x, notconv * this->nbase_x); if (base_device::get_current_precision(swap) == "single") { @@ -544,33 +540,33 @@ void Diago_DavSubspace::diag_zhegvx(const int& nbase, if (this->diag_comm.rank == 0) { Real* eigenvalue_gpu = nullptr; - resmem_real_op()(this->ctx, eigenvalue_gpu, this->nbase_x); + resmem_real_op()(eigenvalue_gpu, this->nbase_x); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalue_gpu, (*eigenvalue_iter).data(), this->nbase_x); + syncmem_var_h2d_op()(eigenvalue_gpu, (*eigenvalue_iter).data(), this->nbase_x); T* hcc_gpu = nullptr; T* scc_gpu = nullptr; T* vcc_gpu = nullptr; - base_device::memory::resize_memory_op()(this->ctx, hcc_gpu, nbase * nbase); - base_device::memory::resize_memory_op()(this->ctx, scc_gpu, nbase * nbase); - base_device::memory::resize_memory_op()(this->ctx, vcc_gpu, nbase * nbase); + base_device::memory::resize_memory_op()(hcc_gpu, nbase * nbase); + base_device::memory::resize_memory_op()(scc_gpu, nbase * nbase); + base_device::memory::resize_memory_op()(vcc_gpu, nbase * nbase); for(int i=0;i()(this->ctx, this->ctx, hcc_gpu + i * nbase, hcc + i * nbase_x, nbase); - base_device::memory::synchronize_memory_op()(this->ctx, this->ctx, scc_gpu + i * nbase, scc + i * nbase_x, nbase); + base_device::memory::synchronize_memory_op()(hcc_gpu + i * nbase, hcc + i * nbase_x, nbase); + base_device::memory::synchronize_memory_op()(scc_gpu + i * nbase, scc + i * nbase_x, nbase); } dngvd_op()(this->ctx, nbase, nbase, hcc_gpu, scc_gpu, eigenvalue_gpu, vcc_gpu); for(int i=0;i()(this->ctx, this->ctx, vcc + i * nbase_x, vcc_gpu + i * nbase, nbase); + base_device::memory::synchronize_memory_op()(vcc + i * nbase_x, vcc_gpu + i * nbase, nbase); } - delmem_complex_op()(this->ctx, hcc_gpu); - delmem_complex_op()(this->ctx, scc_gpu); - delmem_complex_op()(this->ctx, vcc_gpu); + delmem_complex_op()(hcc_gpu); + delmem_complex_op()(scc_gpu); + delmem_complex_op()(vcc_gpu); - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, (*eigenvalue_iter).data(), eigenvalue_gpu, this->nbase_x); + syncmem_var_d2h_op()((*eigenvalue_iter).data(), eigenvalue_gpu, this->nbase_x); - delmem_real_op()(this->ctx, eigenvalue_gpu); + delmem_real_op()(eigenvalue_gpu); } #endif } @@ -715,16 +711,16 @@ void Diago_DavSubspace::refresh(const int& dim, this->dim); // update hphi - syncmem_complex_op()(this->ctx, this->ctx, hphi, psi_iter + nband * this->dim, this->dim * nband); + syncmem_complex_op()(hphi, psi_iter + nband * this->dim, this->dim * nband); nbase = nband; // set hcc/scc/vcc to 0 for (size_t i = 0; i < nbase; i++) { - setmem_complex_op()(this->ctx, &hcc[this->nbase_x * i], 0, nbase); - setmem_complex_op()(this->ctx, &scc[this->nbase_x * i], 0, nbase); - setmem_complex_op()(this->ctx, &vcc[this->nbase_x * i], 0, nbase); + setmem_complex_op()(&hcc[this->nbase_x * i], 0, nbase); + setmem_complex_op()(&scc[this->nbase_x * i], 0, nbase); + setmem_complex_op()(&vcc[this->nbase_x * i], 0, nbase); } if (this->device == base_device::GpuDevice) @@ -733,22 +729,19 @@ void Diago_DavSubspace::refresh(const int& dim, T* hcc_cpu = nullptr; T* scc_cpu = nullptr; T* vcc_cpu = nullptr; - base_device::memory::resize_memory_op()(this->cpu_ctx, - hcc_cpu, + base_device::memory::resize_memory_op()(hcc_cpu, this->nbase_x * this->nbase_x, "DAV::hcc"); - base_device::memory::resize_memory_op()(this->cpu_ctx, - scc_cpu, + base_device::memory::resize_memory_op()(scc_cpu, this->nbase_x * this->nbase_x, "DAV::scc"); - base_device::memory::resize_memory_op()(this->cpu_ctx, - vcc_cpu, + base_device::memory::resize_memory_op()(vcc_cpu, this->nbase_x * this->nbase_x, "DAV::vcc"); - syncmem_d2h_op()(this->cpu_ctx, this->ctx, hcc_cpu, hcc, this->nbase_x * this->nbase_x); - syncmem_d2h_op()(this->cpu_ctx, this->ctx, scc_cpu, scc, this->nbase_x * this->nbase_x); - syncmem_d2h_op()(this->cpu_ctx, this->ctx, vcc_cpu, vcc, this->nbase_x * this->nbase_x); + syncmem_d2h_op()(hcc_cpu, hcc, this->nbase_x * this->nbase_x); + syncmem_d2h_op()(scc_cpu, scc, this->nbase_x * this->nbase_x); + syncmem_d2h_op()(vcc_cpu, vcc, this->nbase_x * this->nbase_x); for (int i = 0; i < nbase; i++) { @@ -757,13 +750,13 @@ void Diago_DavSubspace::refresh(const int& dim, vcc_cpu[i * this->nbase_x + i] = this->one[0]; } - syncmem_h2d_op()(this->ctx, this->cpu_ctx, hcc, hcc_cpu, this->nbase_x * this->nbase_x); - syncmem_h2d_op()(this->ctx, this->cpu_ctx, scc, scc_cpu, this->nbase_x * this->nbase_x); - syncmem_h2d_op()(this->ctx, this->cpu_ctx, vcc, vcc_cpu, this->nbase_x * this->nbase_x); + syncmem_h2d_op()(hcc, hcc_cpu, this->nbase_x * this->nbase_x); + syncmem_h2d_op()(scc, scc_cpu, this->nbase_x * this->nbase_x); + syncmem_h2d_op()(vcc, vcc_cpu, this->nbase_x * this->nbase_x); - base_device::memory::delete_memory_op()(this->cpu_ctx, hcc_cpu); - base_device::memory::delete_memory_op()(this->cpu_ctx, scc_cpu); - base_device::memory::delete_memory_op()(this->cpu_ctx, vcc_cpu); + base_device::memory::delete_memory_op()(hcc_cpu); + base_device::memory::delete_memory_op()(scc_cpu); + base_device::memory::delete_memory_op()(vcc_cpu); #endif } else diff --git a/source/module_hsolver/diago_david.cpp b/source/module_hsolver/diago_david.cpp index b4805a82fa..6afaf998b8 100644 --- a/source/module_hsolver/diago_david.cpp +++ b/source/module_hsolver/diago_david.cpp @@ -59,47 +59,45 @@ DiagoDavid::DiagoDavid(const Real* precondition_in, /// - "basis" : number of occupied ks-orbitals(subscripts i,j) * number of unoccupied ks-orbitals(subscripts a,b), corresponding to "bands" of the ground state // the lowest N eigenvalues - base_device::memory::resize_memory_op()( - this->cpu_ctx, this->eigenvalue, nbase_x, "DAV::eig"); - base_device::memory::set_memory_op()( - this->cpu_ctx, this->eigenvalue, 0, nbase_x); + base_device::memory::resize_memory_op()(this->eigenvalue, nbase_x, "DAV::eig"); + base_device::memory::set_memory_op()(this->eigenvalue, 0, nbase_x); // basis(dim, nbase_x), leading dimension = dim - resmem_complex_op()(this->ctx, basis, nbase_x * dim, "DAV::basis"); - setmem_complex_op()(this->ctx, basis, 0, nbase_x * dim); + resmem_complex_op()(basis, nbase_x * dim, "DAV::basis"); + setmem_complex_op()(basis, 0, nbase_x * dim); //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< // hpsi(nbase_x, dim); // the product of H and psi in the reduced basis set - resmem_complex_op()(this->ctx, this->hpsi, nbase_x * dim, "DAV::hpsi"); - setmem_complex_op()(this->ctx, this->hpsi, 0, nbase_x * dim); + resmem_complex_op()(this->hpsi, nbase_x * dim, "DAV::hpsi"); + setmem_complex_op()(this->hpsi, 0, nbase_x * dim); // spsi(nbase_x, dim); // the Product of S and psi in the reduced basis set - resmem_complex_op()(this->ctx, this->spsi, nbase_x * dim, "DAV::spsi"); - setmem_complex_op()(this->ctx, this->spsi, 0, nbase_x * dim); + resmem_complex_op()(this->spsi, nbase_x * dim, "DAV::spsi"); + setmem_complex_op()(this->spsi, 0, nbase_x * dim); // hcc(nbase_x, nbase_x); // Hamiltonian on the reduced basis - resmem_complex_op()(this->ctx, this->hcc, nbase_x * nbase_x, "DAV::hcc"); - setmem_complex_op()(this->ctx, this->hcc, 0, nbase_x * nbase_x); + resmem_complex_op()(this->hcc, nbase_x * nbase_x, "DAV::hcc"); + setmem_complex_op()(this->hcc, 0, nbase_x * nbase_x); // scc(nbase_x, nbase_x); // Overlap on the reduced basis // resmem_complex_op()(this->ctx, this->scc, nbase_x * nbase_x, "DAV::scc"); // setmem_complex_op()(this->ctx, this->scc, 0, nbase_x * nbase_x); // vcc(nbase_x, nbase_x); // Eigenvectors of hcc - resmem_complex_op()(this->ctx, this->vcc, nbase_x * nbase_x, "DAV::vcc"); - setmem_complex_op()(this->ctx, this->vcc, 0, nbase_x * nbase_x); + resmem_complex_op()(this->vcc, nbase_x * nbase_x, "DAV::vcc"); + setmem_complex_op()(this->vcc, 0, nbase_x * nbase_x); //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< // lagrange_matrix(nband, nband); // for orthogonalization - resmem_complex_op()(this->ctx, this->lagrange_matrix, nband * nband); - setmem_complex_op()(this->ctx, this->lagrange_matrix, 0, nband * nband); + resmem_complex_op()(this->lagrange_matrix, nband * nband); + setmem_complex_op()(this->lagrange_matrix, 0, nband * nband); #if defined(__CUDA) || defined(__ROCM) // device precondition array if (this->device == base_device::GpuDevice) { - resmem_var_op()(this->ctx, this->d_precondition, dim); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, this->d_precondition, this->precondition, dim); + resmem_var_op()(this->d_precondition, dim); + syncmem_var_h2d_op()(this->d_precondition, this->precondition, dim); } #endif } @@ -107,19 +105,19 @@ DiagoDavid::DiagoDavid(const Real* precondition_in, template DiagoDavid::~DiagoDavid() { - delmem_complex_op()(this->ctx, this->basis); - delmem_complex_op()(this->ctx, this->hpsi); - delmem_complex_op()(this->ctx, this->spsi); - delmem_complex_op()(this->ctx, this->hcc); + delmem_complex_op()(this->basis); + delmem_complex_op()(this->hpsi); + delmem_complex_op()(this->spsi); + delmem_complex_op()(this->hcc); // delmem_complex_op()(this->ctx, this->scc); - delmem_complex_op()(this->ctx, this->vcc); - delmem_complex_op()(this->ctx, this->lagrange_matrix); - base_device::memory::delete_memory_op()(this->cpu_ctx, this->eigenvalue); + delmem_complex_op()(this->vcc); + delmem_complex_op()(this->lagrange_matrix); + base_device::memory::delete_memory_op()(this->eigenvalue); // If the device is a GPU device, free the d_precondition array. #if defined(__CUDA) || defined(__ROCM) if (this->device == base_device::GpuDevice) { - delmem_var_op()(this->ctx, this->d_precondition); + delmem_var_op()(this->d_precondition); } #endif } @@ -181,7 +179,7 @@ int DiagoDavid::diag_once(const HPsiFunc& hpsi_func, // begin SchmidtOrth for (int m = 0; m < nband; m++) { - syncmem_complex_op()(this->ctx, this->ctx, basis + dim*m, psi_in + m*ld_psi, dim); + syncmem_complex_op()(basis + dim*m, psi_in + m*ld_psi, dim); this->SchmidtOrth(dim, nband, @@ -266,7 +264,7 @@ int DiagoDavid::diag_once(const HPsiFunc& hpsi_func, // update eigenvectors of Hamiltonian - setmem_complex_op()(this->ctx, psi_in, 0, nband * ld_psi); + setmem_complex_op()(psi_in, 0, nband * ld_psi); //<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< gemm_op()(this->ctx, 'N', @@ -353,8 +351,8 @@ void DiagoDavid::cal_grad(const HPsiFunc& hpsi_func, // vc_ev_vector(notconv, nbase); // eigenvectors of unconverged index extracted from vcc T* vc_ev_vector = nullptr; - resmem_complex_op()(this->ctx, vc_ev_vector, notconv * nbase); - setmem_complex_op()(this->ctx, vc_ev_vector, 0, notconv * nbase); + resmem_complex_op()(vc_ev_vector, notconv * nbase); + setmem_complex_op()(vc_ev_vector, 0, notconv * nbase); //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> // for (int m = 0; m < notconv; m++) @@ -372,9 +370,7 @@ void DiagoDavid::cal_grad(const HPsiFunc& hpsi_func, // vc_ev_vector[m * nbase + i] = vcc[i * nbase_x + unconv[m]]; for (int m = 0; m < notconv; m++) { - syncmem_complex_op()(this->ctx, - this->ctx, - vc_ev_vector + m * nbase, + syncmem_complex_op()(vc_ev_vector + m * nbase, vcc + unconv[m] * nbase_x, nbase); } @@ -419,14 +415,14 @@ void DiagoDavid::cal_grad(const HPsiFunc& hpsi_func, { #if defined(__CUDA) || defined(__ROCM) Real* e_temp_gpu = nullptr; - resmem_var_op()(this->ctx, e_temp_gpu, nbase); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, e_temp_gpu, e_temp_cpu.data(), nbase); + resmem_var_op()(e_temp_gpu, nbase); + syncmem_var_h2d_op()(e_temp_gpu, e_temp_cpu.data(), nbase); vector_mul_vector_op()(this->ctx, nbase, vc_ev_vector + m * nbase, vc_ev_vector + m * nbase, e_temp_gpu); - delmem_var_op()(this->ctx, e_temp_gpu); + delmem_var_op()(e_temp_gpu); #endif } else @@ -499,8 +495,8 @@ void DiagoDavid::cal_grad(const HPsiFunc& hpsi_func, // there is a nbase to nbase + notconv band orthogonalise // plan for SchmidtOrth T* lagrange = nullptr; - resmem_complex_op()(this->ctx, lagrange, notconv * (nbase + notconv)); - setmem_complex_op()(this->ctx, lagrange, 0, notconv * (nbase + notconv)); + resmem_complex_op()(lagrange, notconv * (nbase + notconv)); + setmem_complex_op()(lagrange, 0, notconv * (nbase + notconv)); std::vector pre_matrix_mm_m(notconv, 0); std::vector pre_matrix_mv_m(notconv, 1); @@ -569,8 +565,8 @@ void DiagoDavid::cal_grad(const HPsiFunc& hpsi_func, // hpsi[:, nbase:nbase+notcnv] = H basis[:, nbase:nbase+notcnv] hpsi_func(basis + nbase * dim, hpsi + nbase * dim, dim, notconv); - delmem_complex_op()(this->ctx, lagrange); - delmem_complex_op()(this->ctx, vc_ev_vector); + delmem_complex_op()(lagrange); + delmem_complex_op()(vc_ev_vector); ModuleBase::timer::tick("DiagoDavid", "cal_grad"); return; @@ -635,7 +631,7 @@ void DiagoDavid::cal_elem(const int& dim, // matrixTranspose_op()(this->ctx, nbase_x, nbase_x, scc, scc); auto* swap = new T[notconv * nbase_x]; - syncmem_complex_op()(this->ctx, this->ctx, swap, hcc + nbase * nbase_x, notconv * nbase_x); + syncmem_complex_op()(swap, hcc + nbase * nbase_x, notconv * nbase_x); if (std::is_same::value) { Parallel_Reduce::reduce_pool(hcc + nbase * nbase_x, notconv * nbase_x); @@ -700,13 +696,13 @@ void DiagoDavid::diag_zhegvx(const int& nbase, { #if defined(__CUDA) || defined(__ROCM) Real* eigenvalue_gpu = nullptr; - resmem_var_op()(this->ctx, eigenvalue_gpu, nbase_x); - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, eigenvalue_gpu, this->eigenvalue, nbase_x); + resmem_var_op()(eigenvalue_gpu, nbase_x); + syncmem_var_h2d_op()(eigenvalue_gpu, this->eigenvalue, nbase_x); dnevx_op()(this->ctx, nbase, nbase_x, hcc, nband, eigenvalue_gpu, vcc); - syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, this->eigenvalue, eigenvalue_gpu, nbase_x); - delmem_var_op()(this->ctx, eigenvalue_gpu); + syncmem_var_d2h_op()(this->eigenvalue, eigenvalue_gpu, nbase_x); + delmem_var_op()(eigenvalue_gpu); #endif } else @@ -752,7 +748,7 @@ void DiagoDavid::refresh(const int& dim, ModuleBase::timer::tick("DiagoDavid", "refresh"); // update hp,sp - setmem_complex_op()(this->ctx, basis , 0, nbase_x * dim); + setmem_complex_op()(basis , 0, nbase_x * dim); // basis(dim, nband) = hpsi(dim, nbase) * vcc(nbase, nband) gemm_op()(this->ctx, @@ -790,8 +786,8 @@ void DiagoDavid::refresh(const int& dim, ); // hpsi = basis, spsi = basis[nband] - syncmem_complex_op()(this->ctx, this->ctx, hpsi, basis, dim * nband); - syncmem_complex_op()(this->ctx, this->ctx, spsi, basis + dim*nband, dim * nband); + syncmem_complex_op()(hpsi, basis, dim * nband); + syncmem_complex_op()(spsi, basis + dim*nband, dim * nband); /*for (int m = 0; m < nband; m++) { for (int ig = 0; ig < dim; ig++) { @@ -801,11 +797,11 @@ void DiagoDavid::refresh(const int& dim, }*/ // update basis - setmem_complex_op()(this->ctx, basis , 0, nbase_x * dim); + setmem_complex_op()(basis , 0, nbase_x * dim); for (int m = 0; m < nband; m++) { - syncmem_complex_op()(this->ctx, this->ctx, basis + dim*m,psi_in + m*ld_psi, dim); + syncmem_complex_op()(basis + dim*m,psi_in + m*ld_psi, dim); /*for (int ig = 0; ig < npw; ig++) basis(m, ig) = psi(m, ig);*/ } @@ -814,7 +810,7 @@ void DiagoDavid::refresh(const int& dim, // basis set size reset to nband nbase = nband; - setmem_complex_op()(this->ctx, hcc, 0, nbase_x * nbase_x); + setmem_complex_op()(hcc, 0, nbase_x * nbase_x); // setmem_complex_op()(this->ctx, scc, 0, nbase_x * nbase_x); @@ -824,22 +820,20 @@ void DiagoDavid::refresh(const int& dim, T* hcc_cpu = nullptr; // T* scc_cpu = nullptr; T* vcc_cpu = nullptr; - base_device::memory::resize_memory_op()(this->cpu_ctx, - hcc_cpu, + base_device::memory::resize_memory_op()(hcc_cpu, nbase_x * nbase_x, "DAV::hcc"); // base_device::memory::resize_memory_op()(this->cpu_ctx, // scc_cpu, // nbase_x * nbase_x, // "DAV::scc"); - base_device::memory::resize_memory_op()(this->cpu_ctx, - vcc_cpu, + base_device::memory::resize_memory_op()(vcc_cpu, nbase_x * nbase_x, "DAV::vcc"); - syncmem_d2h_op()(this->cpu_ctx, this->ctx, hcc_cpu, hcc, nbase_x * nbase_x); + syncmem_d2h_op()(hcc_cpu, hcc, nbase_x * nbase_x); // syncmem_d2h_op()(this->cpu_ctx, this->ctx, scc_cpu, scc, nbase_x * nbase_x); - syncmem_d2h_op()(this->cpu_ctx, this->ctx, vcc_cpu, vcc, nbase_x * nbase_x); + syncmem_d2h_op()(vcc_cpu, vcc, nbase_x * nbase_x); for (int i = 0; i < nbase; i++) { @@ -848,13 +842,13 @@ void DiagoDavid::refresh(const int& dim, vcc_cpu[i * nbase_x + i] = this->one[0]; } - syncmem_h2d_op()(this->ctx, this->cpu_ctx, hcc, hcc_cpu, nbase_x * nbase_x); + syncmem_h2d_op()(hcc, hcc_cpu, nbase_x * nbase_x); // syncmem_h2d_op()(this->ctx, this->cpu_ctx, scc, scc_cpu, nbase_x * nbase_x); - syncmem_h2d_op()(this->ctx, this->cpu_ctx, vcc, vcc_cpu, nbase_x * nbase_x); + syncmem_h2d_op()(vcc, vcc_cpu, nbase_x * nbase_x); - base_device::memory::delete_memory_op()(this->cpu_ctx, hcc_cpu); + base_device::memory::delete_memory_op()(hcc_cpu); // base_device::memory::delete_memory_op()(this->cpu_ctx, scc_cpu); - base_device::memory::delete_memory_op()(this->cpu_ctx, vcc_cpu); + base_device::memory::delete_memory_op()(vcc_cpu); #endif } else @@ -941,7 +935,7 @@ void DiagoDavid::SchmidtOrth(const int& dim, Parallel_Reduce::reduce_pool(lagrange_m, m + 1); T var = *this->zero; - syncmem_d2h_op()(this->cpu_ctx, this->ctx, &var, lagrange_m + m, 1); + syncmem_d2h_op()(&var, lagrange_m + m, 1); double psi_norm = get_real(var); assert(psi_norm > 0.0); diff --git a/source/module_hsolver/diago_iter_assist.cpp b/source/module_hsolver/diago_iter_assist.cpp index c05ecdf8ec..5a3acf8e53 100644 --- a/source/module_hsolver/diago_iter_assist.cpp +++ b/source/module_hsolver/diago_iter_assist.cpp @@ -42,12 +42,12 @@ void DiagoIterAssist::diagH_subspace(const hamilt::Hamilt* assert(n_band <= nstart); T *hcc = nullptr, *scc = nullptr, *vcc = nullptr; - resmem_complex_op()(ctx, hcc, nstart * nstart, "DiagSub::hcc"); - resmem_complex_op()(ctx, scc, nstart * nstart, "DiagSub::scc"); - resmem_complex_op()(ctx, vcc, nstart * nstart, "DiagSub::vcc"); - setmem_complex_op()(ctx, hcc, 0, nstart * nstart); - setmem_complex_op()(ctx, scc, 0, nstart * nstart); - setmem_complex_op()(ctx, vcc, 0, nstart * nstart); + resmem_complex_op()(hcc, nstart * nstart, "DiagSub::hcc"); + resmem_complex_op()(scc, nstart * nstart, "DiagSub::scc"); + resmem_complex_op()(vcc, nstart * nstart, "DiagSub::vcc"); + setmem_complex_op()(hcc, 0, nstart * nstart); + setmem_complex_op()(scc, 0, nstart * nstart); + setmem_complex_op()(vcc, 0, nstart * nstart); const int dmin = psi.get_current_ngk(); const int dmax = psi.get_nbasis(); @@ -61,11 +61,11 @@ void DiagoIterAssist::diagH_subspace(const hamilt::Hamilt* } else { - resmem_complex_op()(ctx, temp, nstart * dmax, "DiagSub::temp"); + resmem_complex_op()(temp, nstart * dmax, "DiagSub::temp"); } { // code block to calculate hcc and scc - setmem_complex_op()(ctx, temp, 0, nstart * dmax); + setmem_complex_op()(temp, 0, nstart * dmax); T* hphi = temp; // do hPsi for all bands @@ -140,11 +140,11 @@ void DiagoIterAssist::diagH_subspace(const hamilt::Hamilt* if (!in_place) { matrixSetToAnother()(ctx, n_band, temp, ld_temp, evc.get_pointer(), dmax); - delmem_complex_op()(ctx, temp); + delmem_complex_op()(temp); } - delmem_complex_op()(ctx, hcc); - delmem_complex_op()(ctx, scc); - delmem_complex_op()(ctx, vcc); + delmem_complex_op()(hcc); + delmem_complex_op()(scc); + delmem_complex_op()(vcc); ModuleBase::timer::tick("DiagoIterAssist", "diagH_subspace"); } @@ -192,12 +192,12 @@ void DiagoIterAssist::diagH_subspace_init(hamilt::Hamilt* // ModuleBase::ComplexMatrix sc(nstart, nstart); // ModuleBase::ComplexMatrix hvec(nstart, n_band); T *hcc = nullptr, *scc = nullptr, *vcc = nullptr; - resmem_complex_op()(ctx, hcc, nstart * nstart, "DiagSub::hcc"); - resmem_complex_op()(ctx, scc, nstart * nstart, "DiagSub::scc"); - resmem_complex_op()(ctx, vcc, nstart * nstart, "DiagSub::vcc"); - setmem_complex_op()(ctx, hcc, 0, nstart * nstart); - setmem_complex_op()(ctx, scc, 0, nstart * nstart); - setmem_complex_op()(ctx, vcc, 0, nstart * nstart); + resmem_complex_op()(hcc, nstart * nstart, "DiagSub::hcc"); + resmem_complex_op()(scc, nstart * nstart, "DiagSub::scc"); + resmem_complex_op()(vcc, nstart * nstart, "DiagSub::vcc"); + setmem_complex_op()(hcc, 0, nstart * nstart); + setmem_complex_op()(scc, 0, nstart * nstart); + setmem_complex_op()(vcc, 0, nstart * nstart); if (base_device::get_device_type(ctx) == base_device::GpuDevice) { @@ -206,15 +206,15 @@ void DiagoIterAssist::diagH_subspace_init(hamilt::Hamilt* T* ppsi = psi_temp.get_pointer(); // hpsi and spsi share the temp space T* temp = nullptr; - resmem_complex_op()(ctx, temp, psi_nc, "DiagSub::temp"); - setmem_complex_op()(ctx, temp, 0, psi_nc); + resmem_complex_op()(temp, psi_nc, "DiagSub::temp"); + setmem_complex_op()(temp, 0, psi_nc); T* hpsi = temp; // do hPsi band by band for (int i = 0; i < nstart; i++) { // psi_temp is one band psi, psi is all bands psi, the range always is 1 for the only band in psi_temp - syncmem_complex_op()(ctx, ctx, ppsi, psi + i * psi_nc, psi_nc); + syncmem_complex_op()(ppsi, psi + i * psi_nc, psi_nc); psi::Range band_by_band_range(true, 0, 0, 0); hpsi_info hpsi_in(&psi_temp, band_by_band_range, hpsi); @@ -229,7 +229,7 @@ void DiagoIterAssist::diagH_subspace_init(hamilt::Hamilt* // do sPsi band by band for (int i = 0; i < nstart; i++) { - syncmem_complex_op()(ctx, ctx, ppsi, psi + i * psi_nc, psi_nc); + syncmem_complex_op()(ppsi, psi + i * psi_nc, psi_nc); pHamilt->sPsi(ppsi, spsi, dmin, dmin, 1); gemv_op()(ctx, @@ -245,18 +245,18 @@ void DiagoIterAssist::diagH_subspace_init(hamilt::Hamilt* scc + i * nstart, 1); } - delmem_complex_op()(ctx, temp); + delmem_complex_op()(temp); } else if (base_device::get_device_type(ctx) == base_device::CpuDevice) { psi::Psi psi_temp(1, nstart, psi_nc, dmin, true); T* ppsi = psi_temp.get_pointer(); - syncmem_complex_op()(ctx, ctx, ppsi, psi, psi_temp.size()); + syncmem_complex_op()(ppsi, psi, psi_temp.size()); // hpsi and spsi share the temp space T* temp = nullptr; - resmem_complex_op()(ctx, temp, nstart * psi_nc, "DiagSub::temp"); - setmem_complex_op()(ctx, temp, 0, nstart * psi_nc); + resmem_complex_op()(temp, nstart * psi_nc, "DiagSub::temp"); + setmem_complex_op()(temp, 0, nstart * psi_nc); T* hpsi = temp; // do hPsi for all bands @@ -271,7 +271,7 @@ void DiagoIterAssist::diagH_subspace_init(hamilt::Hamilt* pHamilt->sPsi(ppsi, spsi, psi_temp.get_nbasis(), psi_temp.get_nbasis(), psi_temp.get_nbands()); gemm_op()(ctx, 'C', 'N', nstart, nstart, dmin, &one, ppsi, dmax, spsi, dmax, &zero, scc, nstart); - delmem_complex_op()(ctx, temp); + delmem_complex_op()(temp); add_to_hcc(hcc, nstart); @@ -358,9 +358,9 @@ void DiagoIterAssist::diagH_subspace_init(hamilt::Hamilt* // delmem_complex_op()(ctx, evctemp); } - delmem_complex_op()(ctx, hcc); - delmem_complex_op()(ctx, scc); - delmem_complex_op()(ctx, vcc); + delmem_complex_op()(hcc); + delmem_complex_op()(scc); + delmem_complex_op()(vcc); ModuleBase::timer::tick("DiagoIterAssist", "diagH_subspace_init"); } @@ -377,8 +377,8 @@ void DiagoIterAssist::diagH_LAPACK(const int nstart, ModuleBase::timer::tick("DiagoIterAssist", "diagH_LAPACK"); Real* eigenvalues = nullptr; - resmem_var_op()(ctx, eigenvalues, nstart); - setmem_var_op()(ctx, eigenvalues, 0, nstart); + resmem_var_op()(eigenvalues, nstart); + setmem_var_op()(eigenvalues, 0, nstart); dngvd_op()(ctx, nstart, ldh, hcc, scc, eigenvalues, vcc); @@ -386,16 +386,16 @@ void DiagoIterAssist::diagH_LAPACK(const int nstart, { #if ((defined __CUDA) || (defined __ROCM)) // set eigenvalues in GPU to e in CPU - syncmem_var_d2h_op()(cpu_ctx, gpu_ctx, e, eigenvalues, nbands); + syncmem_var_d2h_op()(e, eigenvalues, nbands); #endif } else if (base_device::get_device_type(ctx) == base_device::CpuDevice) { // set eigenvalues in CPU to e in CPU - syncmem_var_op()(ctx, ctx, e, eigenvalues, nbands); + syncmem_var_op()(e, eigenvalues, nbands); } - delmem_var_op()(ctx, eigenvalues); + delmem_var_op()(eigenvalues); // const bool all_eigenvalues = (nstart == nbands); // if (all_eigenvalues) { @@ -423,18 +423,18 @@ void DiagoIterAssist::cal_hs_subspace(const hamilt::Hamilt { const int nstart = psi.get_nbands(); - setmem_complex_op()(ctx, hcc, 0, nstart * nstart); - setmem_complex_op()(ctx, scc, 0, nstart * nstart); + setmem_complex_op()(hcc, 0, nstart * nstart); + setmem_complex_op()(scc, 0, nstart * nstart); const int dmin = psi.get_current_ngk(); const int dmax = psi.get_nbasis(); T* temp = nullptr; - resmem_complex_op()(ctx, temp, nstart * dmax, "DiagSub::temp"); - setmem_complex_op()(ctx, temp, 0, nstart * dmax); + resmem_complex_op()(temp, nstart * dmax, "DiagSub::temp"); + setmem_complex_op()(temp, 0, nstart * dmax); { // code block to calculate hcc and scc - setmem_complex_op()(ctx, temp, 0, nstart * dmax); + setmem_complex_op()(temp, 0, nstart * dmax); T* hphi = temp; // do hPsi for all bands @@ -483,7 +483,7 @@ void DiagoIterAssist::cal_hs_subspace(const hamilt::Hamilt Parallel_Reduce::reduce_pool(scc, nstart * nstart); } - delmem_complex_op()(ctx, temp); + delmem_complex_op()(temp); } template @@ -502,8 +502,8 @@ void DiagoIterAssist::diag_responce( const T* hcc, const int nstart = nbands; T *vcc = nullptr; - resmem_complex_op()(ctx, vcc, nstart * nstart, "DiagSub::vcc"); - setmem_complex_op()(ctx, vcc, 0, nstart * nstart); + resmem_complex_op()(vcc, nstart * nstart, "DiagSub::vcc"); + setmem_complex_op()(vcc, 0, nstart * nstart); // after generation of H and S matrix, diag them DiagoIterAssist::diagH_LAPACK(nstart, nstart, hcc, scc, nstart, en, vcc); @@ -525,7 +525,7 @@ void DiagoIterAssist::diag_responce( const T* hcc, mat_col); } - delmem_complex_op()(ctx, vcc); + delmem_complex_op()(vcc); ModuleBase::timer::tick("DiagoIterAssist", "diag_responce"); } @@ -545,8 +545,8 @@ void DiagoIterAssist::diag_subspace_psi(const T* hcc, const int n_band = evc.get_nbands(); T *vcc = nullptr; - resmem_complex_op()(ctx, vcc, nstart * nstart, "DiagSub::vcc"); - setmem_complex_op()(ctx, vcc, 0, nstart * nstart); + resmem_complex_op()(vcc, nstart * nstart, "DiagSub::vcc"); + setmem_complex_op()(vcc, 0, nstart * nstart); // after generation of H and S matrix, diag them DiagoIterAssist::diagH_LAPACK(nstart, nstart, hcc, scc, nstart, en, vcc); @@ -555,8 +555,8 @@ void DiagoIterAssist::diag_subspace_psi(const T* hcc, const int dmin = evc.get_current_ngk(); const int dmax = evc.get_nbasis(); T* temp = nullptr; - resmem_complex_op()(ctx, temp, nstart * dmax, "DiagSub::temp"); - setmem_complex_op()(ctx, temp, 0, nstart * dmax); + resmem_complex_op()(temp, nstart * dmax, "DiagSub::temp"); + setmem_complex_op()(temp, 0, nstart * dmax); gemm_op()(ctx, 'N', 'N', @@ -572,10 +572,10 @@ void DiagoIterAssist::diag_subspace_psi(const T* hcc, temp, dmin); matrixSetToAnother()(ctx, n_band, temp, dmin, evc.get_pointer(), dmax); - delmem_complex_op()(ctx, temp); + delmem_complex_op()(temp); } - delmem_complex_op()(ctx, vcc); + delmem_complex_op()(vcc); ModuleBase::timer::tick("DiagoIterAssist", "diag_subspace_psi"); } diff --git a/source/module_hsolver/hsolver_lcaopw.cpp b/source/module_hsolver/hsolver_lcaopw.cpp index 059318034a..b6e95b4c03 100644 --- a/source/module_hsolver/hsolver_lcaopw.cpp +++ b/source/module_hsolver/hsolver_lcaopw.cpp @@ -270,8 +270,6 @@ void HSolverLIP::solve(hamilt::Hamilt* pHamilt, // ESolver_KS_PW::p_hamilt /// calculate the contribution of Psi for charge density rho } base_device::memory::cast_memory_op()( - cpu_ctx, - cpu_ctx, pes->ekb.c, eigenvalues.data(), pes->ekb.nr * pes->ekb.nc); diff --git a/source/module_hsolver/hsolver_pw.cpp b/source/module_hsolver/hsolver_pw.cpp index de627d3474..05ccc8acd0 100644 --- a/source/module_hsolver/hsolver_pw.cpp +++ b/source/module_hsolver/hsolver_pw.cpp @@ -329,8 +329,6 @@ void HSolverPW::solve(hamilt::Hamilt* pHamilt, // copy eigenvalues to ekb in ElecState base_device::memory::cast_memory_op()( - cpu_ctx, - cpu_ctx, // pes->ekb.c, out_eigenvalues, eigenvalues.data(), @@ -450,8 +448,6 @@ void HSolverPW::hamiltSolvePsiK(hamilt::Hamilt* hm, else { base_device::memory::synchronize_memory_op()( - this->ctx, - this->ctx, spsi_out.data(), psi_in.data(), static_cast((ndim == 1 ? 1 : psi_in.shape().dim_size(0)) diff --git a/source/module_hsolver/kernels/cuda/math_kernel_op.cu b/source/module_hsolver/kernels/cuda/math_kernel_op.cu index 70ed5ebf0b..cd3ac41812 100644 --- a/source/module_hsolver/kernels/cuda/math_kernel_op.cu +++ b/source/module_hsolver/kernels/cuda/math_kernel_op.cu @@ -887,7 +887,7 @@ void matrixTranspose_op::operator()(const base_ double* output_matrix) { double* device_temp = nullptr; - base_device::memory::resize_memory_op()(d, device_temp, row * col); + base_device::memory::resize_memory_op()(device_temp, row * col); if (row == col) { @@ -906,13 +906,11 @@ void matrixTranspose_op::operator()(const base_ } base_device::memory::synchronize_memory_op()( - d, - d, output_matrix, device_temp, row * col); - base_device::memory::delete_memory_op()(d, device_temp); + base_device::memory::delete_memory_op()(device_temp); } template <> @@ -924,7 +922,7 @@ void matrixTranspose_op, base_device::DEVICE_GPU>::operator( std::complex* output_matrix) { std::complex* device_temp = nullptr; - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(d, device_temp, row * col); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(device_temp, row * col); if (row == col) { @@ -947,13 +945,11 @@ void matrixTranspose_op, base_device::DEVICE_GPU>::operator( } base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()( - d, - d, output_matrix, device_temp, row * col); - base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(d, device_temp); + base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(device_temp); cudaCheckOnDebug(); @@ -968,7 +964,7 @@ void matrixTranspose_op, base_device::DEVICE_GPU>::operator std::complex* output_matrix) { std::complex* device_temp = nullptr; - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(d, device_temp, row * col); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(device_temp, row * col); if (row == col) { @@ -989,9 +985,9 @@ void matrixTranspose_op, base_device::DEVICE_GPU>::operator base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, - base_device::DEVICE_GPU>()(d, d, output_matrix, device_temp, row * col); + base_device::DEVICE_GPU>()(output_matrix, device_temp, row * col); - base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(d, device_temp); + base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(device_temp); } template <> diff --git a/source/module_hsolver/kernels/math_kernel_op.cpp b/source/module_hsolver/kernels/math_kernel_op.cpp index 3a752c3659..db2a12e9db 100644 --- a/source/module_hsolver/kernels/math_kernel_op.cpp +++ b/source/module_hsolver/kernels/math_kernel_op.cpp @@ -323,7 +323,7 @@ struct matrixTranspose_op T* output_matrix) { T* temp = nullptr; - base_device::memory::resize_memory_op()(d, temp, row * col, "MTransOp"); + base_device::memory::resize_memory_op()(temp, row * col, "MTransOp"); #ifdef _OPENMP #pragma omp parallel for collapse(2) schedule(static, 8192 / sizeof(T)) #endif @@ -341,7 +341,7 @@ struct matrixTranspose_op { output_matrix[i] = temp[i]; } - base_device::memory::delete_memory_op()(d, temp); + base_device::memory::delete_memory_op()(temp); } }; diff --git a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu index ef5a1c1ece..1993ae4c64 100644 --- a/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu +++ b/source/module_hsolver/kernels/rocm/math_kernel_op.hip.cu @@ -806,7 +806,7 @@ void matrixTranspose_op::operator()(const base_ double* output_matrix) { double* device_temp = nullptr; - base_device::memory::resize_memory_op()(d, device_temp, row * col); + base_device::memory::resize_memory_op()(device_temp, row * col); if (row == col) { @@ -823,13 +823,11 @@ void matrixTranspose_op::operator()(const base_ } base_device::memory::synchronize_memory_op()( - d, - d, output_matrix, device_temp, row * col); - base_device::memory::delete_memory_op()(d, device_temp); + base_device::memory::delete_memory_op()(device_temp); } template <> @@ -841,7 +839,7 @@ void matrixTranspose_op, base_device::DEVICE_GPU>::operator( std::complex* output_matrix) { std::complex* device_temp = nullptr; - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(d, device_temp, row * col); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(device_temp, row * col); if (row == col) { @@ -863,13 +861,11 @@ void matrixTranspose_op, base_device::DEVICE_GPU>::operator( } base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()( - d, - d, output_matrix, device_temp, row * col); - base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(d, device_temp); + base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(device_temp); } template <> @@ -881,7 +877,7 @@ void matrixTranspose_op, base_device::DEVICE_GPU>::operator std::complex* output_matrix) { std::complex* device_temp = nullptr; - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(d, device_temp, row * col); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(device_temp, row * col); if (row == col) { @@ -898,9 +894,9 @@ void matrixTranspose_op, base_device::DEVICE_GPU>::operator base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, - base_device::DEVICE_GPU>()(d, d, output_matrix, device_temp, row * col); + base_device::DEVICE_GPU>()(output_matrix, device_temp, row * col); - base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(d, device_temp); + base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(device_temp); } template <> diff --git a/source/module_hsolver/kernels/test/math_dngvd_test.cpp b/source/module_hsolver/kernels/test/math_dngvd_test.cpp index 8b614ae9a0..a67b18d4be 100644 --- a/source/module_hsolver/kernels/test/math_dngvd_test.cpp +++ b/source/module_hsolver/kernels/test/math_dngvd_test.cpp @@ -140,8 +140,8 @@ TEST_F(TestModuleHsolverMathDngvd, transpose_gpu) // {-0.351417,-1.73472}, {-8.32667,2.3744}, {4.16334,3.64292} }; std::complex* device_transpose = nullptr; - resize_memory_op_Z()(gpu_ctx, device_transpose, matrix_size); - synchronize_memory_op_C2G_Z()(gpu_ctx, cpu_ctx, device_transpose, transpose.data(), transpose.size()); + resize_memory_op_Z()(device_transpose, matrix_size); + synchronize_memory_op_C2G_Z()(device_transpose, transpose.data(), transpose.size()); // run hsolver::createGpuBlasHandle(); @@ -162,7 +162,7 @@ TEST_F(TestModuleHsolverMathDngvd, transpose_gpu) {0.0, 0.0}, // {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0} }; - synchronize_memory_op_G2C_Z()(cpu_ctx, gpu_ctx, transpose_result.data(), device_transpose, transpose.size()); + synchronize_memory_op_G2C_Z()(transpose_result.data(), device_transpose, transpose.size()); // std::vector > test_result = { // {-0.351417,-1.73472}, {-0.351417,-1.73472}, {-0.351417,-1.73472}, diff --git a/source/module_hsolver/kernels/test/math_kernel_test.cpp b/source/module_hsolver/kernels/test/math_kernel_test.cpp index c2c66fb936..0781d54787 100644 --- a/source/module_hsolver/kernels/test/math_kernel_test.cpp +++ b/source/module_hsolver/kernels/test/math_kernel_test.cpp @@ -371,16 +371,16 @@ TEST_F(TestModuleHsolverMathKernel, gemv_op_cpu) TEST_F(TestModuleHsolverMathKernel, zdot_real_op_gpu) { std::complex*psi_L_dev = NULL, *psi_R_dev = NULL; - resize_memory_op()(gpu_ctx, psi_L_dev, psi_L.size()); - resize_memory_op()(gpu_ctx, psi_R_dev, psi_R.size()); - synchronize_memory_op()(gpu_ctx, cpu_ctx, psi_L_dev, psi_L.data(), psi_L.size()); - synchronize_memory_op()(gpu_ctx, cpu_ctx, psi_R_dev, psi_R.data(), psi_R.size()); + resize_memory_op()(psi_L_dev, psi_L.size()); + resize_memory_op()(psi_R_dev, psi_R.size()); + synchronize_memory_op()(psi_L_dev, psi_L.data(), psi_L.size()); + synchronize_memory_op()(psi_R_dev, psi_R.data(), psi_R.size()); hsolver::createGpuBlasHandle(); double result = zdot_real_gpu_op()(gpu_ctx, dim, psi_L_dev, psi_R_dev, false); hsolver::destoryBLAShandle(); EXPECT_LT(fabs(result - expected_result), 1e-12); - delete_memory_op()(gpu_ctx, psi_L_dev); - delete_memory_op()(gpu_ctx, psi_R_dev); + delete_memory_op()(psi_L_dev); + delete_memory_op()(psi_R_dev); } TEST_F(TestModuleHsolverMathKernel, vector_div_constant_op_gpu) @@ -390,22 +390,22 @@ TEST_F(TestModuleHsolverMathKernel, vector_div_constant_op_gpu) // in GPU std::complex* input_dev = NULL; std::complex* output_dev = NULL; - resize_memory_op()(gpu_ctx, input_dev, input.size()); - resize_memory_op()(gpu_ctx, output_dev, input.size()); + resize_memory_op()(input_dev, input.size()); + resize_memory_op()(output_dev, input.size()); // syn the input data in CPU to GPU - synchronize_memory_op()(gpu_ctx, cpu_ctx, input_dev, input.data(), input.size()); + synchronize_memory_op()(input_dev, input.data(), input.size()); // run vector_div_constant_op_gpu()(gpu_ctx, dim, output_dev, input_dev, constant); // syn the output data in GPU to CPU - synchronize_memory_op_gpu()(cpu_ctx, gpu_ctx, output.data(), output_dev, output.size()); + synchronize_memory_op_gpu()(output.data(), output_dev, output.size()); for (int i = 0; i < input.size(); i++) { EXPECT_LT(fabs(output[i].imag() - output_vector_div_constant_op[i].imag()), 1e-8); EXPECT_LT(fabs(output[i].real() - output_vector_div_constant_op[i].real()), 1e-8); } - delete_memory_op()(gpu_ctx, input_dev); - delete_memory_op()(gpu_ctx, output_dev); + delete_memory_op()(input_dev); + delete_memory_op()(output_dev); } TEST_F(TestModuleHsolverMathKernel, vector_mul_vector_op_gpu) @@ -419,19 +419,19 @@ TEST_F(TestModuleHsolverMathKernel, vector_mul_vector_op_gpu) std::complex* output_dev = NULL; // resize memory for values - resize_memory_op()(gpu_ctx, input_dev, input.size()); - resize_memory_op_double()(gpu_ctx, input_double_dev, input.size()); - resize_memory_op()(gpu_ctx, output_dev, input.size()); + resize_memory_op()(input_dev, input.size()); + resize_memory_op_double()(input_double_dev, input.size()); + resize_memory_op()(output_dev, input.size()); // syn the input data in CPU to GPU - synchronize_memory_op()(gpu_ctx, cpu_ctx, input_dev, input.data(), input.size()); - synchronize_memory_op_double()(gpu_ctx, cpu_ctx, input_double_dev, input_double.data(), input.size()); + synchronize_memory_op()(input_dev, input.data(), input.size()); + synchronize_memory_op_double()(input_double_dev, input_double.data(), input.size()); // run vector_mul_vector_op_gpu()(gpu_ctx, dim, output_dev, input_dev, input_double_dev); // syn the output data in GPU to CPU - synchronize_memory_op_gpu()(cpu_ctx, gpu_ctx, output.data(), output_dev, output.size()); + synchronize_memory_op_gpu()(output.data(), output_dev, output.size()); for (int i = 0; i < input.size(); i++) { @@ -439,9 +439,9 @@ TEST_F(TestModuleHsolverMathKernel, vector_mul_vector_op_gpu) EXPECT_LT(fabs(output[i].real() - output_vector_mul_vector_op[i].real()), 1e-8); } - delete_memory_op()(gpu_ctx, input_dev); - delete_memory_op_double()(gpu_ctx, input_double_dev); - delete_memory_op()(gpu_ctx, output_dev); + delete_memory_op()(input_dev); + delete_memory_op_double()(input_double_dev); + delete_memory_op()(output_dev); } TEST_F(TestModuleHsolverMathKernel, vector_div_vector_op_gpu) @@ -455,19 +455,19 @@ TEST_F(TestModuleHsolverMathKernel, vector_div_vector_op_gpu) std::complex* output_dev = NULL; // resize memory for values in GPU - resize_memory_op()(gpu_ctx, input_dev, input.size()); - resize_memory_op_double()(gpu_ctx, input_double_dev, input.size()); - resize_memory_op()(gpu_ctx, output_dev, input.size()); + resize_memory_op()(input_dev, input.size()); + resize_memory_op_double()(input_double_dev, input.size()); + resize_memory_op()(output_dev, input.size()); // syn the input data in CPU to GPU - synchronize_memory_op()(gpu_ctx, cpu_ctx, input_dev, input.data(), input.size()); - synchronize_memory_op_double()(gpu_ctx, cpu_ctx, input_double_dev, input_double.data(), input.size()); + synchronize_memory_op()(input_dev, input.data(), input.size()); + synchronize_memory_op_double()(input_double_dev, input_double.data(), input.size()); // run vector_div_vector_op_gpu()(gpu_ctx, dim, output_dev, input_dev, input_double_dev); // syn the output data in GPU to CPU - synchronize_memory_op_gpu()(cpu_ctx, gpu_ctx, output.data(), output_dev, output.size()); + synchronize_memory_op_gpu()(output.data(), output_dev, output.size()); for (int i = 0; i < input.size(); i++) { @@ -475,9 +475,9 @@ TEST_F(TestModuleHsolverMathKernel, vector_div_vector_op_gpu) EXPECT_LT(fabs(output[i].real() - output_vector_div_vector_op[i].real()), 1e-8); } - delete_memory_op()(gpu_ctx, input_dev); - delete_memory_op_double()(gpu_ctx, input_double_dev); - delete_memory_op()(gpu_ctx, output_dev); + delete_memory_op()(input_dev); + delete_memory_op_double()(input_double_dev); + delete_memory_op()(output_dev); } TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_gpu) @@ -491,13 +491,13 @@ TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_gp std::complex* output_dev = NULL; // resize memory for values in GPU - resize_memory_op()(gpu_ctx, input1_dev, input.size()); - resize_memory_op()(gpu_ctx, input2_dev, input.size()); - resize_memory_op()(gpu_ctx, output_dev, input.size()); + resize_memory_op()(input1_dev, input.size()); + resize_memory_op()(input2_dev, input.size()); + resize_memory_op()(output_dev, input.size()); // syn the input data in CPU to GPU - synchronize_memory_op()(gpu_ctx, cpu_ctx, input1_dev, input1.data(), input.size()); - synchronize_memory_op()(gpu_ctx, cpu_ctx, input2_dev, input2.data(), input.size()); + synchronize_memory_op()(input1_dev, input1.data(), input.size()); + synchronize_memory_op()(input2_dev, input2.data(), input.size()); // run constantvector_addORsub_constantVector_op_gpu()(gpu_ctx, @@ -509,7 +509,7 @@ TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_gp constant2); // syn the output data in GPU to CPU - synchronize_memory_op_gpu()(cpu_ctx, gpu_ctx, output.data(), output_dev, output.size()); + synchronize_memory_op_gpu()(output.data(), output_dev, output.size()); for (int i = 0; i < input.size(); i++) { @@ -517,9 +517,9 @@ TEST_F(TestModuleHsolverMathKernel, constantvector_addORsub_constantVector_op_gp EXPECT_LT(fabs(output[i].real() - output_constantvector_addORsub_constantVector_op[i].real()), 1e-8); } - delete_memory_op()(gpu_ctx, input1_dev); - delete_memory_op()(gpu_ctx, input2_dev); - delete_memory_op()(gpu_ctx, output_dev); + delete_memory_op()(input1_dev); + delete_memory_op()(input2_dev); + delete_memory_op()(output_dev); } TEST_F(TestModuleHsolverMathKernel, axpy_op_gpu) @@ -529,12 +529,12 @@ TEST_F(TestModuleHsolverMathKernel, axpy_op_gpu) std::complex* Y_axpy_dev = NULL; // resize memory for values in GPU - resize_memory_op()(gpu_ctx, X_axpy_dev, X_axpy.size()); - resize_memory_op()(gpu_ctx, Y_axpy_dev, Y_axpy.size()); + resize_memory_op()(X_axpy_dev, X_axpy.size()); + resize_memory_op()(Y_axpy_dev, Y_axpy.size()); // syn the input data in CPU to GPU - synchronize_memory_op()(gpu_ctx, cpu_ctx, X_axpy_dev, X_axpy.data(), X_axpy.size()); - synchronize_memory_op()(gpu_ctx, cpu_ctx, Y_axpy_dev, Y_axpy.data(), Y_axpy.size()); + synchronize_memory_op()(X_axpy_dev, X_axpy.data(), X_axpy.size()); + synchronize_memory_op()(Y_axpy_dev, Y_axpy.data(), Y_axpy.size()); // run hsolver::createGpuBlasHandle(); @@ -542,7 +542,7 @@ TEST_F(TestModuleHsolverMathKernel, axpy_op_gpu) hsolver::destoryBLAShandle(); // syn the output data in GPU to CPU - synchronize_memory_op_gpu()(cpu_ctx, gpu_ctx, Y_axpy.data(), Y_axpy_dev, Y_axpy.size()); + synchronize_memory_op_gpu()(Y_axpy.data(), Y_axpy_dev, Y_axpy.size()); for (int i = 0; i < input.size(); i++) { @@ -550,8 +550,8 @@ TEST_F(TestModuleHsolverMathKernel, axpy_op_gpu) EXPECT_LT(fabs(Y_axpy[i].real() - output_axpy_op[i].real()), 1e-8); } - delete_memory_op()(gpu_ctx, X_axpy_dev); - delete_memory_op()(gpu_ctx, Y_axpy_dev); + delete_memory_op()(X_axpy_dev); + delete_memory_op()(Y_axpy_dev); } TEST_F(TestModuleHsolverMathKernel, scal_op_gpu) @@ -560,10 +560,10 @@ TEST_F(TestModuleHsolverMathKernel, scal_op_gpu) std::complex* X_scal_dev = NULL; // resize memory for values in GPU - resize_memory_op()(gpu_ctx, X_scal_dev, X_scal.size()); + resize_memory_op()(X_scal_dev, X_scal.size()); // syn the input data in CPU to GPU - synchronize_memory_op()(gpu_ctx, cpu_ctx, X_scal_dev, X_scal.data(), X_scal.size()); + synchronize_memory_op()(X_scal_dev, X_scal.data(), X_scal.size()); // run hsolver::createGpuBlasHandle(); @@ -571,14 +571,14 @@ TEST_F(TestModuleHsolverMathKernel, scal_op_gpu) hsolver::destoryBLAShandle(); // syn the output data in GPU to CPU - synchronize_memory_op_gpu()(cpu_ctx, gpu_ctx, X_scal.data(), X_scal_dev, X_scal.size()); + synchronize_memory_op_gpu()(X_scal.data(), X_scal_dev, X_scal.size()); for (int i = 0; i < input.size(); i++) { EXPECT_LT(fabs(X_scal[i].imag() - output_scal_op[i].imag()), 1e-8); EXPECT_LT(fabs(X_scal[i].real() - output_scal_op[i].real()), 1e-8); } - delete_memory_op()(gpu_ctx, X_scal_dev); + delete_memory_op()(X_scal_dev); } TEST_F(TestModuleHsolverMathKernel, gemv_op_gpu) @@ -589,21 +589,21 @@ TEST_F(TestModuleHsolverMathKernel, gemv_op_gpu) std::complex* Y_gemv_dev = NULL; // resize memory for values in GPU - resize_memory_op()(gpu_ctx, A_gemv_dev, A_gemv.size()); - resize_memory_op()(gpu_ctx, X_gemv_dev, X_gemv.size()); - resize_memory_op()(gpu_ctx, Y_gemv_dev, Y_gemv.size()); + resize_memory_op()(A_gemv_dev, A_gemv.size()); + resize_memory_op()(X_gemv_dev, X_gemv.size()); + resize_memory_op()(Y_gemv_dev, Y_gemv.size()); // syn the input data in CPU to GPU - synchronize_memory_op()(gpu_ctx, cpu_ctx, A_gemv_dev, A_gemv.data(), A_gemv.size()); - synchronize_memory_op()(gpu_ctx, cpu_ctx, X_gemv_dev, X_gemv.data(), X_gemv.size()); - synchronize_memory_op()(gpu_ctx, cpu_ctx, Y_gemv_dev, Y_gemv.data(), Y_gemv.size()); + synchronize_memory_op()(A_gemv_dev, A_gemv.data(), A_gemv.size()); + synchronize_memory_op()(X_gemv_dev, X_gemv.data(), X_gemv.size()); + synchronize_memory_op()(Y_gemv_dev, Y_gemv.data(), Y_gemv.size()); // run hsolver::createGpuBlasHandle(); gemv_op_gpu()(gpu_ctx, 'C', 2, 3, &ModuleBase::ONE, A_gemv_dev, 2, X_gemv_dev, 1, &ModuleBase::ONE, Y_gemv_dev, 1); hsolver::destoryBLAShandle(); // syn the output data in GPU to CPU - synchronize_memory_op_gpu()(cpu_ctx, gpu_ctx, Y_gemv.data(), Y_gemv_dev, Y_gemv.size()); + synchronize_memory_op_gpu()(Y_gemv.data(), Y_gemv_dev, Y_gemv.size()); // cal right answer: Y_test_gemv char trans = 'C'; @@ -628,9 +628,9 @@ TEST_F(TestModuleHsolverMathKernel, gemv_op_gpu) EXPECT_LT(fabs(Y_gemv[i].real() - Y_test_gemv[i].real()), 1e-12); } - delete_memory_op()(gpu_ctx, A_gemv_dev); - delete_memory_op()(gpu_ctx, X_gemv_dev); - delete_memory_op()(gpu_ctx, Y_gemv_dev); + delete_memory_op()(A_gemv_dev); + delete_memory_op()(X_gemv_dev); + delete_memory_op()(Y_gemv_dev); } TEST_F(TestModuleHsolverMathKernel, matrixSetToAnother_op_gpu) @@ -654,20 +654,16 @@ TEST_F(TestModuleHsolverMathKernel, matrixSetToAnother_op_gpu) int LDB = 4; std::complex* device_A = nullptr; - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(gpu_ctx, device_A, A.size()); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(device_A, A.size()); base_device::memory:: - synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(gpu_ctx, - cpu_ctx, - device_A, + synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(device_A, A.data(), A.size()); std::complex* device_B = nullptr; - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(gpu_ctx, device_B, B.size()); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(device_B, B.size()); base_device::memory:: - synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(gpu_ctx, - cpu_ctx, - device_B, + synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(device_B, B.data(), B.size()); @@ -682,9 +678,7 @@ TEST_F(TestModuleHsolverMathKernel, matrixSetToAnother_op_gpu) std::vector> B_gpu2cpu(8); base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, - base_device::DEVICE_GPU>()(cpu_ctx, - gpu_ctx, - B_gpu2cpu.data(), + base_device::DEVICE_GPU>()(B_gpu2cpu.data(), device_B, B_gpu2cpu.size()); @@ -721,8 +715,8 @@ TEST_F(TestModuleHsolverMathKernel, matrixSetToAnother_op_gpu) EXPECT_LT(fabs(B_gpu2cpu[i].real() - B_cpu[i].real()), 1e-12); } - delete_memory_op()(gpu_ctx, device_A); - delete_memory_op()(gpu_ctx, device_B); + delete_memory_op()(device_A); + delete_memory_op()(device_B); } #endif // __UT_USE_CUDA || __UT_USE_ROCM diff --git a/source/module_hsolver/kernels/test/perf_math_kernel.cpp b/source/module_hsolver/kernels/test/perf_math_kernel.cpp index 173ef8b40b..b2b0704a9d 100644 --- a/source/module_hsolver/kernels/test/perf_math_kernel.cpp +++ b/source/module_hsolver/kernels/test/perf_math_kernel.cpp @@ -105,14 +105,14 @@ class PerfModuleHsolverMathKernel : public benchmark::Fixture { zconstant_a = std::complex{(double)rand()+(double)rand()/(RAND_MAX+1.0),(double)rand()+(double)rand()/(RAND_MAX+1.0)}; #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM - resize_memory_op()(gpu_ctx, test_zvector_a_gpu, dim_vector); - resize_memory_op()(gpu_ctx, test_zvector_b_gpu, dim_vector); - synchronize_memory_op()(gpu_ctx, cpu_ctx, test_zvector_a_gpu, test_zvector_a, dim_vector); - synchronize_memory_op()(gpu_ctx, cpu_ctx, test_zvector_b_gpu, test_zvector_b, dim_vector); - - resize_memory_op()(gpu_ctx, result_zvector_gpu, dim_vector); - resize_memory_op_double()(gpu_ctx, test_dvector_a_gpu, dim_vector); - synchronize_memory_op_double()(gpu_ctx, cpu_ctx, test_dvector_a_gpu, test_dvector_a, dim_vector); + resize_memory_op()(test_zvector_a_gpu, dim_vector); + resize_memory_op()(test_zvector_b_gpu, dim_vector); + synchronize_memory_op()(test_zvector_a_gpu, test_zvector_a, dim_vector); + synchronize_memory_op()(test_zvector_b_gpu, test_zvector_b, dim_vector); + + resize_memory_op()(result_zvector_gpu, dim_vector); + resize_memory_op_double()(test_dvector_a_gpu, dim_vector); + synchronize_memory_op_double()(test_dvector_a_gpu, test_dvector_a, dim_vector); hsolver::createGpuBlasHandle(); diff --git a/source/module_hsolver/test/hsolver_pw_sup.h b/source/module_hsolver/test/hsolver_pw_sup.h index fcb2862a29..c61ffaca7d 100644 --- a/source/module_hsolver/test/hsolver_pw_sup.h +++ b/source/module_hsolver/test/hsolver_pw_sup.h @@ -139,15 +139,13 @@ DiagoDavid::DiagoDavid(const Real* precondition_in, template DiagoDavid::~DiagoDavid() { - delmem_complex_op()(this->ctx, this->hpsi); - delmem_complex_op()(this->ctx, this->spsi); - delmem_complex_op()(this->ctx, this->hcc); - delmem_complex_op()(this->ctx, this->scc); - delmem_complex_op()(this->ctx, this->vcc); - delmem_complex_op()(this->ctx, this->lagrange_matrix); - base_device::memory::delete_memory_op()( - this->cpu_ctx, - this->eigenvalue); + delmem_complex_op()(this->hpsi); + delmem_complex_op()(this->spsi); + delmem_complex_op()(this->hcc); + delmem_complex_op()(this->scc); + delmem_complex_op()(this->vcc); + delmem_complex_op()(this->lagrange_matrix); + base_device::memory::delete_memory_op()(this->eigenvalue); } template diff --git a/source/module_psi/psi.cpp b/source/module_psi/psi.cpp index 7942b412c9..a69635dffb 100644 --- a/source/module_psi/psi.cpp +++ b/source/module_psi/psi.cpp @@ -40,7 +40,7 @@ Psi::~Psi() { if (this->allocate_inside) { - delete_memory_op()(this->ctx, this->psi); + delete_memory_op()(this->psi); } } @@ -58,7 +58,7 @@ Psi::Psi(const int nk_in, const int nbd_in, const int nbs_in, const i this->ngk = ngk_in; // modify later // This function will delete the psi array first(if psi exist), then malloc a new memory for it. - resize_memory_op()(this->ctx, this->psi, nk_in * static_cast(nbd_in) * nbs_in, "no_record"); + resize_memory_op()(this->psi, nk_in * static_cast(nbd_in) * nbs_in, "no_record"); this->nk = nk_in; this->nbands = nbd_in; @@ -96,7 +96,7 @@ Psi::Psi(const int nk_in, this->ngk = ngk_in.data(); // modify later // This function will delete the psi array first(if psi exist), then malloc a new memory for it. - resize_memory_op()(this->ctx, this->psi, nk_in * static_cast(nbd_in) * nbs_in, "no_record"); + resize_memory_op()(this->psi, nk_in * static_cast(nbd_in) * nbs_in, "no_record"); this->nk = nk_in; this->nbands = nbd_in; @@ -166,7 +166,7 @@ Psi::Psi(const int nk_in, this->ngk = nullptr; assert(nk_in > 0 && nbd_in >= 0 && nbs_in > 0); - resize_memory_op()(this->ctx, this->psi, nk_in * static_cast(nbd_in) * nbs_in, "no_record"); + resize_memory_op()(this->psi, nk_in * static_cast(nbd_in) * nbs_in, "no_record"); this->nk = nk_in; this->nbands = nbd_in; @@ -201,9 +201,7 @@ Psi::Psi(const Psi& psi_in) // this function will copy psi_in.psi to this->psi no matter the device types of each other. this->resize(psi_in.get_nk(), psi_in.get_nbands(), psi_in.get_nbasis()); - base_device::memory::synchronize_memory_op()(this->ctx, - psi_in.get_device(), - this->psi, + base_device::memory::synchronize_memory_op()(this->psi, psi_in.get_pointer() - psi_in.get_psi_bias(), psi_in.size()); this->psi_bias = psi_in.get_psi_bias(); @@ -238,25 +236,19 @@ Psi::Psi(const Psi& psi_in) { auto* arr = (T*)malloc(sizeof(T) * psi_in.size()); // cast the memory from T_in to T in CPU - base_device::memory::cast_memory_op()(psi_in.get_device(), - psi_in.get_device(), - arr, + base_device::memory::cast_memory_op()(arr, psi_in.get_pointer() - psi_in.get_psi_bias(), psi_in.size()); // synchronize the memory from CPU to GPU - base_device::memory::synchronize_memory_op()(this->ctx, - psi_in.get_device(), - this->psi, + base_device::memory::synchronize_memory_op()(this->psi, arr, psi_in.size()); free(arr); } else { - base_device::memory::cast_memory_op()(this->ctx, - psi_in.get_device(), - this->psi, + base_device::memory::cast_memory_op()(this->psi, psi_in.get_pointer() - psi_in.get_psi_bias(), psi_in.size()); } @@ -269,7 +261,7 @@ template void Psi::set_all_psi(const T* another_pointer, const std::size_t size_in) { assert(size_in == this->size()); - synchronize_memory_op()(this->ctx, this->ctx, this->psi, another_pointer, this->size()); + synchronize_memory_op()(this->psi, another_pointer, this->size()); } template @@ -278,7 +270,7 @@ void Psi::resize(const int nks_in, const int nbands_in, const int nba assert(nks_in > 0 && nbands_in >= 0 && nbasis_in > 0); // This function will delete the psi array first(if psi exist), then malloc a new memory for it. - resize_memory_op()(this->ctx, this->psi, nks_in * static_cast(nbands_in) * nbasis_in, "no_record"); + resize_memory_op()(this->psi, nks_in * static_cast(nbands_in) * nbasis_in, "no_record"); // this->zero_out(); @@ -496,7 +488,7 @@ template void Psi::zero_out() { // this->psi.assign(this->psi.size(), T(0)); - set_memory_op()(this->ctx, this->psi, 0, this->size()); + set_memory_op()(this->psi, 0, this->size()); } template diff --git a/source/module_psi/psi_init.cpp b/source/module_psi/psi_init.cpp index 2cdce4a5a8..102e2d4b1a 100644 --- a/source/module_psi/psi_init.cpp +++ b/source/module_psi/psi_init.cpp @@ -139,7 +139,7 @@ void PSIInit::initialize_psi(Psi>* psi, this->psi_initer->init_psig(psi_cpu->get_pointer(), ik); if (psi_device->get_pointer() != psi_cpu->get_pointer()) { - syncmem_h2d_op()(ctx, cpu_ctx, psi_device->get_pointer(), psi_cpu->get_pointer(), nbands_start * nbasis); + syncmem_h2d_op()(psi_device->get_pointer(), psi_cpu->get_pointer(), nbands_start * nbasis); } std::vector::type> etatom(nbands_start, 0.0); @@ -170,7 +170,7 @@ void PSIInit::initialize_psi(Psi>* psi, { if (psi_device->get_pointer() != kspw_psi->get_pointer()) { - syncmem_complex_op()(ctx, ctx, kspw_psi->get_pointer(), psi_device->get_pointer(), nbands * nbasis); + syncmem_complex_op()(kspw_psi->get_pointer(), psi_device->get_pointer(), nbands * nbasis); } } } // end k-point loop