Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions source/source_io/module_parameter/read_input_item_system.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,7 @@ Available options are:
* double: double precision
* mix: mixed precision, starting from single precision and switching to double precision when the SCF residual becomes small enough)";
item.default_value = "double";
item.availability = "Used only for LCAO basis set on CPU.";
item.availability = "Used only for LCAO basis set.";
read_sync_string(input.gint_precision);
item.check_value = [](const Input_Item& item, const Parameter& para) {
std::vector<std::string> avail_list = {"single", "double", "mix"};
Expand All @@ -707,12 +707,11 @@ Available options are:
const std::string warningstr = nofound_str(avail_list, "gint_precision");
ModuleBase::WARNING_QUIT("ReadInput", warningstr);
}
if (para.inp.gint_precision != "double"
&& (para.inp.basis_type != "lcao" || para.inp.device != "cpu"))
if (para.inp.gint_precision != "double" && para.inp.basis_type != "lcao")
{
ModuleBase::WARNING_QUIT(
"ReadInput",
"gint_precision = single or mix is currently supported only for CPU LCAO calculations.\n");
"gint_precision = single or mix is currently supported only for LCAO calculations.\n");
}
if (para.inp.gint_precision != "double" && para.inp.nspin == 4)
{
Expand Down
2 changes: 1 addition & 1 deletion source/source_lcao/module_gint/gint_fvl_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ void Gint_fvl_gpu::cal_fvl_svl_()
CHECK_CUDA(cudaSetDevice(gint_info_->get_dev_id()));
cudaStream_t stream;
CHECK_CUDA(cudaStreamCreate(&stream));
PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream);
PhiOperatorGpu<double> phi_op(gint_info_->get_gpu_vars(), stream);
CudaMemWrapper<double> phi(BatchBigGrid::get_max_phi_len(), stream, false);
CudaMemWrapper<double> phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false);
CudaMemWrapper<double> phi_vldr3_dm(BatchBigGrid::get_max_phi_len(), stream, false);
Expand Down
2 changes: 1 addition & 1 deletion source/source_lcao/module_gint/gint_fvl_meta_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ void Gint_fvl_meta_gpu::cal_fvl_svl_()
CHECK_CUDA(cudaSetDevice(gint_info_->get_dev_id()));
cudaStream_t stream;
CHECK_CUDA(cudaStreamCreate(&stream));
PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream);
PhiOperatorGpu<double> phi_op(gint_info_->get_gpu_vars(), stream);
CudaMemWrapper<double> phi(BatchBigGrid::get_max_phi_len(), stream, false);
CudaMemWrapper<double> phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false);
CudaMemWrapper<double> phi_vldr3_dm(BatchBigGrid::get_max_phi_len(), stream, false);
Expand Down
80 changes: 46 additions & 34 deletions source/source_lcao/module_gint/gint_rho_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,63 +5,63 @@
#include "kernel/phi_operator_gpu.h"
#include "source_base/module_device/device_check.h"

#include <algorithm>

namespace ModuleGint
{

void Gint_rho_gpu::cal_gint()
{
ModuleBase::TITLE("Gint", "cal_gint_rho");
ModuleBase::timer::start("Gint", "cal_gint_rho");
init_dm_gint_();
transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec_);
cal_rho_();
ModuleBase::timer::end("Gint", "cal_gint_rho");
}

void Gint_rho_gpu::init_dm_gint_()
{
dm_gint_vec_.resize(nspin_);
for (int is = 0; is < nspin_; is++)
switch (gint_info_->get_exec_precision())
{
dm_gint_vec_[is] = gint_info_->get_hr<double>();
case GintPrecision::fp32:
cal_gint_impl_<float>();
break;
case GintPrecision::fp64:
default:
cal_gint_impl_<double>();
break;
}
ModuleBase::timer::end("Gint", "cal_gint_rho");
}

void Gint_rho_gpu::transfer_cpu_to_gpu_()
template<typename Real>
void Gint_rho_gpu::cal_gint_impl_()
{
dm_gint_d_vec_.resize(nspin_);
rho_d_vec_.resize(nspin_);
// 1. Initialize dm_gint as HContainer<Real>
std::vector<HContainer<Real>> dm_gint_vec(nspin_);
for (int is = 0; is < nspin_; is++)
{
dm_gint_d_vec_[is] = CudaMemWrapper<double>(dm_gint_vec_[is].get_nnr(), 0, false);
rho_d_vec_[is] = CudaMemWrapper<double>(gint_info_->get_local_mgrid_num(), 0, false);
CHECK_CUDA(cudaMemcpy(dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is].get_wrapper(),
dm_gint_vec_[is].get_nnr() * sizeof(double), cudaMemcpyHostToDevice));
dm_gint_vec[is] = gint_info_->get_hr<Real>();
}
}

void Gint_rho_gpu::transfer_gpu_to_cpu_()
{
// 2. Transfer dm from 2D parallel distribution to gint serial distribution
transfer_dm_2d_to_gint(*gint_info_, dm_vec_, dm_gint_vec);

// 3. Transfer dm to GPU
std::vector<CudaMemWrapper<Real>> dm_gint_d_vec(nspin_);
std::vector<CudaMemWrapper<Real>> rho_d_vec(nspin_);
for (int is = 0; is < nspin_; is++)
{
CHECK_CUDA(cudaMemcpy(rho_[is], rho_d_vec_[is].get_device_ptr(),
gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyDeviceToHost));
dm_gint_d_vec[is] = CudaMemWrapper<Real>(dm_gint_vec[is].get_nnr(), 0, false);
rho_d_vec[is] = CudaMemWrapper<Real>(gint_info_->get_local_mgrid_num(), 0, false);
CHECK_CUDA(cudaMemcpy(dm_gint_d_vec[is].get_device_ptr(), dm_gint_vec[is].get_wrapper(),
dm_gint_vec[is].get_nnr() * sizeof(Real), cudaMemcpyHostToDevice));
}
}

void Gint_rho_gpu::cal_rho_()
{
transfer_cpu_to_gpu_();
// 4. Calculate rho on GPU
#pragma omp parallel num_threads(gint_info_->get_streams_num())
{
// 20240620 Note that it must be set again here because
// 20240620 Note that it must be set again here because
// cuda's device is not safe in a multi-threaded environment.
CHECK_CUDA(cudaSetDevice(gint_info_->get_dev_id()));
cudaStream_t stream;
CHECK_CUDA(cudaStreamCreate(&stream));
PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream);
CudaMemWrapper<double> phi(BatchBigGrid::get_max_phi_len(), stream, false);
CudaMemWrapper<double> phi_dm(BatchBigGrid::get_max_phi_len(), stream, false);
PhiOperatorGpu<Real> phi_op(gint_info_->get_gpu_vars(), stream);
CudaMemWrapper<Real> phi(BatchBigGrid::get_max_phi_len(), stream, false);
CudaMemWrapper<Real> phi_dm(BatchBigGrid::get_max_phi_len(), stream, false);
#pragma omp for schedule(dynamic)
for (int i = 0; i < gint_info_->get_bgrid_batches_num(); ++i)
{
Expand All @@ -74,15 +74,27 @@ void Gint_rho_gpu::cal_rho_()
phi_op.set_phi(phi.get_device_ptr());
for(int is = 0; is < nspin_; is++)
{
phi_op.phi_mul_dm(phi.get_device_ptr(), dm_gint_d_vec_[is].get_device_ptr(), dm_gint_vec_[is],
phi_op.phi_mul_dm(phi.get_device_ptr(), dm_gint_d_vec[is].get_device_ptr(), dm_gint_vec[is],
is_dm_symm_, phi_dm.get_device_ptr());
phi_op.phi_dot_phi(phi.get_device_ptr(), phi_dm.get_device_ptr(), rho_d_vec_[is].get_device_ptr());
phi_op.phi_dot_phi(phi.get_device_ptr(), phi_dm.get_device_ptr(), rho_d_vec[is].get_device_ptr());
}
}
CHECK_CUDA(cudaStreamSynchronize(stream));
CHECK_CUDA(cudaStreamDestroy(stream));
}
transfer_gpu_to_cpu_();

// 5. Transfer rho back to CPU and convert to double if needed
const int local_mgrid_num = gint_info_->get_local_mgrid_num();
for (int is = 0; is < nspin_; is++)
{
std::vector<Real> rho_tmp(local_mgrid_num);
CHECK_CUDA(cudaMemcpy(rho_tmp.data(), rho_d_vec[is].get_device_ptr(),
local_mgrid_num * sizeof(Real), cudaMemcpyDeviceToHost));
for (int ir = 0; ir < local_mgrid_num; ++ir)
{
rho_[is][ir] = static_cast<double>(rho_tmp[ir]);
}
}
}

} // namespace ModuleGint
15 changes: 2 additions & 13 deletions source/source_lcao/module_gint/gint_rho_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,8 @@ class Gint_rho_gpu: public Gint
void cal_gint();

private:
void init_dm_gint_();

void cal_rho_();

void transfer_cpu_to_gpu_();

void transfer_gpu_to_cpu_();
template<typename Real>
void cal_gint_impl_();

// input
const std::vector<HContainer<double>*> dm_vec_;
Expand All @@ -41,12 +36,6 @@ class Gint_rho_gpu: public Gint

// output
double ** rho_ = nullptr;

// Intermediate variables
std::vector<HContainer<double>> dm_gint_vec_;

std::vector<CudaMemWrapper<double>> dm_gint_d_vec_;
std::vector<CudaMemWrapper<double>> rho_d_vec_;
};

}
2 changes: 1 addition & 1 deletion source/source_lcao/module_gint/gint_tau_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ void Gint_tau_gpu::cal_tau_()
CHECK_CUDA(cudaSetDevice(gint_info_->get_dev_id()));
cudaStream_t stream;
CHECK_CUDA(cudaStreamCreate(&stream));
PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream);
PhiOperatorGpu<double> phi_op(gint_info_->get_gpu_vars(), stream);
CudaMemWrapper<double> dphi_x(BatchBigGrid::get_max_phi_len(), stream, false);
CudaMemWrapper<double> dphi_y(BatchBigGrid::get_max_phi_len(), stream, false);
CudaMemWrapper<double> dphi_z(BatchBigGrid::get_max_phi_len(), stream, false);
Expand Down
87 changes: 62 additions & 25 deletions source/source_lcao/module_gint/gint_vl_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,52 +5,83 @@
#include "kernel/phi_operator_gpu.h"
#include "source_base/module_device/device_check.h"

#include <algorithm>
#include <type_traits>

namespace ModuleGint
{

void Gint_vl_gpu::cal_gint()
{
ModuleBase::TITLE("Gint", "cal_gint_vl");
ModuleBase::timer::start("Gint", "cal_gint_vl");
init_hr_gint_();
cal_hr_gint_();
compose_hr_gint(hr_gint_);
transfer_hr_gint_to_hR(hr_gint_, *hR_);
switch (gint_info_->get_exec_precision())
{
case GintPrecision::fp32:
cal_gint_impl_<float>();
break;
case GintPrecision::fp64:
default:
cal_gint_impl_<double>();
break;
}
ModuleBase::timer::end("Gint", "cal_gint_vl");
}

void Gint_vl_gpu::init_hr_gint_()
// Helper: finalize hr_gint (double path — no cast needed)
inline void finalize_hr_gint_gpu_(HContainer<double>& hr_gint, HContainer<double>* hR)
{
hr_gint_ = gint_info_->get_hr<double>();
compose_hr_gint(hr_gint);
transfer_hr_gint_to_hR(hr_gint, *hR);
}

void Gint_vl_gpu::transfer_cpu_to_gpu_()
// Helper: finalize hr_gint (non-double path — cast to double first)
template<typename Real>
void finalize_hr_gint_gpu_(HContainer<Real>& hr_gint, HContainer<double>* hR)
{
hr_gint_d_ = CudaMemWrapper<double>(hr_gint_.get_nnr(), 0, false);
vr_eff_d_ = CudaMemWrapper<double>(gint_info_->get_local_mgrid_num(), 0, false);
CHECK_CUDA(cudaMemcpy(vr_eff_d_.get_device_ptr(), vr_eff_,
gint_info_->get_local_mgrid_num() * sizeof(double), cudaMemcpyHostToDevice));
HContainer<double> hr_gint_dp = make_cast_hcontainer<double>(hr_gint);
compose_hr_gint(hr_gint_dp);
transfer_hr_gint_to_hR(hr_gint_dp, *hR);
}

void Gint_vl_gpu::transfer_gpu_to_cpu_()
template<typename Real>
void Gint_vl_gpu::cal_gint_impl_()
{
CHECK_CUDA(cudaMemcpy(hr_gint_.get_wrapper(), hr_gint_d_.get_device_ptr(),
hr_gint_.get_nnr() * sizeof(double), cudaMemcpyDeviceToHost));
}
// 1. Initialize hr_gint as HContainer<Real>
HContainer<Real> hr_gint = gint_info_->get_hr<Real>();

void Gint_vl_gpu::cal_hr_gint_()
{
transfer_cpu_to_gpu_();
// 2. Convert vr_eff to Real and transfer to GPU
const int local_mgrid_num = gint_info_->get_local_mgrid_num();
CudaMemWrapper<Real> vr_eff_d(local_mgrid_num, 0, false);
CudaMemWrapper<Real> hr_gint_d(hr_gint.get_nnr(), 0, false);

if (std::is_same<Real, double>::value)
{
// No conversion needed
CHECK_CUDA(cudaMemcpy(vr_eff_d.get_device_ptr(), reinterpret_cast<const Real*>(vr_eff_),
local_mgrid_num * sizeof(Real), cudaMemcpyHostToDevice));
}
else
{
// Convert double vr_eff to Real (float)
std::vector<Real> vr_eff_buffer(local_mgrid_num);
std::transform(vr_eff_, vr_eff_ + local_mgrid_num, vr_eff_buffer.begin(),
[](const double v) { return static_cast<Real>(v); });
CHECK_CUDA(cudaMemcpy(vr_eff_d.get_device_ptr(), vr_eff_buffer.data(),
local_mgrid_num * sizeof(Real), cudaMemcpyHostToDevice));
}

// 3. Calculate hr_gint on GPU
#pragma omp parallel num_threads(gint_info_->get_streams_num())
{
// 20240620 Note that it must be set again here because
// 20240620 Note that it must be set again here because
// cuda's device is not safe in a multi-threaded environment.
CHECK_CUDA(cudaSetDevice(gint_info_->get_dev_id()));
cudaStream_t stream;
CHECK_CUDA(cudaStreamCreate(&stream));
PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream);
CudaMemWrapper<double> phi(BatchBigGrid::get_max_phi_len(), stream, false);
CudaMemWrapper<double> phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false);
PhiOperatorGpu<Real> phi_op(gint_info_->get_gpu_vars(), stream);
CudaMemWrapper<Real> phi(BatchBigGrid::get_max_phi_len(), stream, false);
CudaMemWrapper<Real> phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false);
#pragma omp for schedule(dynamic)
for (int i = 0; i < gint_info_->get_bgrid_batches_num(); ++i)
{
Expand All @@ -61,15 +92,21 @@ void Gint_vl_gpu::cal_hr_gint_()
}
phi_op.set_bgrid_batch(bgrid_batch);
phi_op.set_phi(phi.get_device_ptr());
phi_op.phi_mul_vldr3(vr_eff_d_.get_device_ptr(), dr3_,
phi_op.phi_mul_vldr3(vr_eff_d.get_device_ptr(), static_cast<Real>(dr3_),
phi.get_device_ptr(), phi_vldr3.get_device_ptr());
phi_op.phi_mul_phi(phi.get_device_ptr(), phi_vldr3.get_device_ptr(),
hr_gint_, hr_gint_d_.get_device_ptr());
hr_gint, hr_gint_d.get_device_ptr());
}
CHECK_CUDA(cudaStreamSynchronize(stream));
CHECK_CUDA(cudaStreamDestroy(stream));
}
transfer_gpu_to_cpu_();

// 4. Transfer hr_gint back to CPU
CHECK_CUDA(cudaMemcpy(hr_gint.get_wrapper(), hr_gint_d.get_device_ptr(),
hr_gint.get_nnr() * sizeof(Real), cudaMemcpyDeviceToHost));

// 5. Compose and transfer to hR (with cast if needed)
finalize_hr_gint_gpu_(hr_gint, hR_);
}

}
15 changes: 2 additions & 13 deletions source/source_lcao/module_gint/gint_vl_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,8 @@ class Gint_vl_gpu : public Gint
void cal_gint();

private:

void init_hr_gint_();

void transfer_cpu_to_gpu_();

void transfer_gpu_to_cpu_();

void cal_hr_gint_();
template<typename Real>
void cal_gint_impl_();

// input
const double* vr_eff_ = nullptr;
Expand All @@ -39,11 +33,6 @@ class Gint_vl_gpu : public Gint

// Intermediate variables
double dr3_;

HContainer<double> hr_gint_;

CudaMemWrapper<double> hr_gint_d_;
CudaMemWrapper<double> vr_eff_d_;
};

}
2 changes: 1 addition & 1 deletion source/source_lcao/module_gint/gint_vl_metagga_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ void Gint_vl_metagga_gpu::cal_hr_gint_()
CHECK_CUDA(cudaSetDevice(gint_info_->get_dev_id()));
cudaStream_t stream;
CHECK_CUDA(cudaStreamCreate(&stream));
PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream);
PhiOperatorGpu<double> phi_op(gint_info_->get_gpu_vars(), stream);
CudaMemWrapper<double> phi(BatchBigGrid::get_max_phi_len(), stream, false);
CudaMemWrapper<double> phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false);
CudaMemWrapper<double> dphi_x(BatchBigGrid::get_max_phi_len(), stream, false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ void Gint_vl_metagga_nspin4_gpu::cal_hr_gint_()
CHECK_CUDA(cudaSetDevice(gint_info_->get_dev_id()));
cudaStream_t stream;
CHECK_CUDA(cudaStreamCreate(&stream));
PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream);
PhiOperatorGpu<double> phi_op(gint_info_->get_gpu_vars(), stream);
CudaMemWrapper<double> phi(BatchBigGrid::get_max_phi_len(), stream, false);
CudaMemWrapper<double> phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false);
CudaMemWrapper<double> dphi_x(BatchBigGrid::get_max_phi_len(), stream, false);
Expand Down
2 changes: 1 addition & 1 deletion source/source_lcao/module_gint/gint_vl_nspin4_gpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ void Gint_vl_nspin4_gpu::cal_hr_gint_()
CHECK_CUDA(cudaSetDevice(gint_info_->get_dev_id()));
cudaStream_t stream;
CHECK_CUDA(cudaStreamCreate(&stream));
PhiOperatorGpu phi_op(gint_info_->get_gpu_vars(), stream);
PhiOperatorGpu<double> phi_op(gint_info_->get_gpu_vars(), stream);
CudaMemWrapper<double> phi(BatchBigGrid::get_max_phi_len(), stream, false);
CudaMemWrapper<double> phi_vldr3(BatchBigGrid::get_max_phi_len(), stream, false);
#pragma omp for schedule(dynamic)
Expand Down
Loading
Loading