Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions gpu4pyscf/lib/multigrid/multigrid_v2/screen.cu
Original file line number Diff line number Diff line change
Expand Up @@ -206,4 +206,34 @@ void put_pairs_on_blocks(

checkCudaErrors(cudaPeekAtLastError());
}

int tailor_gaussian_pairs(
int *sorted_pairs_per_local_grid, int *n_pairs_per_local_grid,
const int i_angular, const int j_angular, const int *non_trivial_pairs,
const int *i_shells, const int *j_shells, const int n_j_shells,
const int *shell_to_ao_indices,
const int *accumulated_n_pairs_per_local_grid,
const int *sorted_block_index, const int n_contributing_blocks,
const int *image_indices, const double *vectors_to_neighboring_images,
const int n_images, const int *mesh, const int *atm, const int *bas,
const double *env, const int is_non_orthogonal, const double threshold,
const int derivative_order) {
if (is_non_orthogonal) {
return gpu4pyscf::gpbc::multi_grid::tailor_gaussian_pairs_driver<true>(
sorted_pairs_per_local_grid, n_pairs_per_local_grid, i_angular,
j_angular, non_trivial_pairs, i_shells, j_shells, n_j_shells,
shell_to_ao_indices, accumulated_n_pairs_per_local_grid,
sorted_block_index, n_contributing_blocks, image_indices,
vectors_to_neighboring_images, n_images, mesh, atm, bas, env, threshold,
derivative_order);
} else {
return gpu4pyscf::gpbc::multi_grid::tailor_gaussian_pairs_driver<false>(
sorted_pairs_per_local_grid, n_pairs_per_local_grid, i_angular,
j_angular, non_trivial_pairs, i_shells, j_shells, n_j_shells,
shell_to_ao_indices, accumulated_n_pairs_per_local_grid,
sorted_block_index, n_contributing_blocks, image_indices,
vectors_to_neighboring_images, n_images, mesh, atm, bas, env, threshold,
derivative_order);
}
}
}
316 changes: 315 additions & 1 deletion gpu4pyscf/lib/multigrid/multigrid_v2/screening.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

#define EIJ_CUTOFF 60
#define BLOCK_DIM_XYZ 4
#define EXP_OVERFLOW 400
#define EXP_OVERFLOW 400

namespace gpu4pyscf::gpbc::multi_grid {

Expand Down Expand Up @@ -644,4 +644,318 @@ __global__ void put_pairs_on_blocks_kernel(
}
}

template <int i_angular, int j_angular, bool is_non_orthogonal>
__global__ static void tailor_gaussian_pairs_kernel(
int *sorted_pairs_per_local_grid, int *n_pairs_per_local_grid,
const int *non_trivial_pairs, const int *i_shells, const int *j_shells,
const int n_j_shells, const int *shell_to_ao_indices,
const int *accumulated_n_pairs_per_local_grid,
const int *sorted_block_index, const int *image_indices,
const double *vectors_to_neighboring_images, const int n_images,
const int mesh_a, const int mesh_b, const int mesh_c, const int *atm,
const int *bas, const double *env, const double threshold,
const int derivative_order) {
constexpr int n_threads = BLOCK_DIM_XYZ * BLOCK_DIM_XYZ * BLOCK_DIM_XYZ;

const int block_index = sorted_block_index[blockIdx.x];
const int n_blocks_b = (mesh_b + BLOCK_DIM_XYZ - 1) / BLOCK_DIM_XYZ;
const int n_blocks_c = (mesh_c + BLOCK_DIM_XYZ - 1) / BLOCK_DIM_XYZ;

const int block_a_stride = n_blocks_b * n_blocks_c;
const int block_a_index = block_index / block_a_stride;
const int block_ab_index = block_index % block_a_stride;
const int block_b_index = block_ab_index / n_blocks_c;
const int block_c_index = block_ab_index % n_blocks_c;

const int a_start = block_a_index * BLOCK_DIM_XYZ;
const int b_start = block_b_index * BLOCK_DIM_XYZ;
const int c_start = block_c_index * BLOCK_DIM_XYZ;

const double start_position_x =
dxyz_dabc[0] * a_start + dxyz_dabc[3] * b_start + dxyz_dabc[6] * c_start;
const double start_position_y =
dxyz_dabc[1] * a_start + dxyz_dabc[4] * b_start + dxyz_dabc[7] * c_start;
const double start_position_z =
dxyz_dabc[2] * a_start + dxyz_dabc[5] * b_start + dxyz_dabc[8] * c_start;

const double a_dot_b = dxyz_dabc[0] * dxyz_dabc[3] +
dxyz_dabc[1] * dxyz_dabc[4] +
dxyz_dabc[2] * dxyz_dabc[5];
const double a_dot_c = dxyz_dabc[0] * dxyz_dabc[6] +
dxyz_dabc[1] * dxyz_dabc[7] +
dxyz_dabc[2] * dxyz_dabc[8];
const double b_dot_c = dxyz_dabc[3] * dxyz_dabc[6] +
dxyz_dabc[4] * dxyz_dabc[7] +
dxyz_dabc[5] * dxyz_dabc[8];

const int a_upper = min(a_start + BLOCK_DIM_XYZ, mesh_a) - a_start;
const int b_upper = min(b_start + BLOCK_DIM_XYZ, mesh_b) - b_start;
const int c_upper = min(c_start + BLOCK_DIM_XYZ, mesh_c) - c_start;

const int thread_id = threadIdx.x + threadIdx.y * BLOCK_DIM_XYZ +
threadIdx.z * BLOCK_DIM_XYZ * BLOCK_DIM_XYZ;

const int start_pair_index = accumulated_n_pairs_per_local_grid[block_index];
const int end_pair_index =
accumulated_n_pairs_per_local_grid[block_index + 1];
const int n_pairs = end_pair_index - start_pair_index;
const int n_batches = (n_pairs + n_threads - 1) / n_threads;

for (int i_batch = 0, i_pair_index = start_pair_index + thread_id;
i_batch < n_batches; i_batch++, i_pair_index += n_threads) {
const bool is_valid_pair = i_pair_index < end_pair_index;
const int i_pair =
is_valid_pair ? sorted_pairs_per_local_grid[i_pair_index] : 0;

const int image_index = image_indices[i_pair];
const int image_index_i = image_index / n_images;
const int image_index_j = image_index % n_images;

const int shell_pair_index = non_trivial_pairs[i_pair];
const int i_shell_index = shell_pair_index / n_j_shells;
const int j_shell_index = shell_pair_index % n_j_shells;
const int i_shell = i_shells[i_shell_index];
const int j_shell = j_shells[j_shell_index];

const double i_exponent = env[bas(PTR_EXP, i_shell)];
const int i_coord_offset = atm(PTR_COORD, bas(ATOM_OF, i_shell));
const double i_x =
env[i_coord_offset] + vectors_to_neighboring_images[image_index_i * 3];
const double i_y = env[i_coord_offset + 1] +
vectors_to_neighboring_images[image_index_i * 3 + 1];
const double i_z = env[i_coord_offset + 2] +
vectors_to_neighboring_images[image_index_i * 3 + 2];
const double i_coeff = env[bas(PTR_COEFF, i_shell)];

const double j_exponent = env[bas(PTR_EXP, j_shell)];
const int j_coord_offset = atm(PTR_COORD, bas(ATOM_OF, j_shell));
const double j_x =
env[j_coord_offset] + vectors_to_neighboring_images[image_index_j * 3];
const double j_y = env[j_coord_offset + 1] +
vectors_to_neighboring_images[image_index_j * 3 + 1];
const double j_z = env[j_coord_offset + 2] +
vectors_to_neighboring_images[image_index_j * 3 + 2];
const double j_coeff = env[bas(PTR_COEFF, j_shell)];

const double ij_exponent = i_exponent + j_exponent;
const double ij_exponent_in_prefactor =
i_exponent * j_exponent / ij_exponent *
distance_squared(i_x - j_x, i_y - j_y, i_z - j_z);

const double pair_x = (i_exponent * i_x + j_exponent * j_x) / ij_exponent;
const double pair_y = (i_exponent * i_y + j_exponent * j_y) / ij_exponent;
const double pair_z = (i_exponent * i_z + j_exponent * j_z) / ij_exponent;

const double x0 = start_position_x - pair_x;
const double y0 = start_position_y - pair_y;
const double z0 = start_position_z - pair_z;

const double gaussian_exponent_at_reference =
ij_exponent * distance_squared(x0, y0, z0);

const double pair_prefactor = i_coeff * j_coeff *
common_fac_sp<double, i_angular>() *
common_fac_sp<double, j_angular>();

const double gaussian_starting_point =
is_valid_pair
? exp(-(ij_exponent_in_prefactor + gaussian_exponent_at_reference) /
3.0)
: 0;

const double da_squared =
distance_squared(dxyz_dabc[0], dxyz_dabc[1], dxyz_dabc[2]);
const double db_squared =
distance_squared(dxyz_dabc[3], dxyz_dabc[4], dxyz_dabc[5]);
const double dc_squared =
distance_squared(dxyz_dabc[6], dxyz_dabc[7], dxyz_dabc[8]);

const double exp_da_squared = exp(-2 * ij_exponent * da_squared);
const double exp_db_squared = exp(-2 * ij_exponent * db_squared);
const double exp_dc_squared = exp(-2 * ij_exponent * dc_squared);

const double cross_term_a =
dxyz_dabc[0] * x0 + dxyz_dabc[1] * y0 + dxyz_dabc[2] * z0;
const double cross_term_b =
dxyz_dabc[3] * x0 + dxyz_dabc[4] * y0 + dxyz_dabc[5] * z0;
const double cross_term_c =
dxyz_dabc[6] * x0 + dxyz_dabc[7] * y0 + dxyz_dabc[8] * z0;

const double recursion_factor_a_start =
exp(-ij_exponent * (2 * cross_term_a + da_squared));
const double recursion_factor_b_start =
exp(-ij_exponent * (2 * cross_term_b + db_squared));
const double recursion_factor_c_start =
exp(-ij_exponent * (2 * cross_term_c + dc_squared));

const double exp_dadb = exp(-2 * ij_exponent * a_dot_b);
const double exp_dadc = exp(-2 * ij_exponent * a_dot_c);
const double exp_dbdc = exp(-2 * ij_exponent * b_dot_c);

int a_index, b_index, c_index;
double x, y, z;
double gaussian_x, gaussian_y, gaussian_z, recursion_factor_a,
recursion_factor_b, recursion_factor_c;
double recursion_factor_ab_pow_a = 1;
double recursion_factor_ac_pow_a = 1;
double recursion_factor_bc_pow_b = 1;

if constexpr (is_non_orthogonal) {
// recursion_factor_ab_pow_a = 1;
// recursion_factor_ac_pow_a = 1;
} else {
x = start_position_x;
}

double max_gaussian_value = 0;

for (a_index = 0, gaussian_x = gaussian_starting_point,
recursion_factor_a = recursion_factor_a_start;
a_index < a_upper; a_index++, gaussian_x *= recursion_factor_a,
recursion_factor_a *= exp_da_squared) {

if constexpr (is_non_orthogonal) {
recursion_factor_bc_pow_b = 1;
} else {
y = start_position_y;
}
for (b_index = 0, gaussian_y = gaussian_starting_point,
recursion_factor_b = recursion_factor_b_start;
b_index < b_upper; b_index++,
gaussian_y *= recursion_factor_b * recursion_factor_ab_pow_a,
recursion_factor_b *= exp_db_squared) {

if constexpr (is_non_orthogonal) {
x = start_position_x + a_index * dxyz_dabc[0] +
b_index * dxyz_dabc[3];
y = start_position_y + a_index * dxyz_dabc[1] +
b_index * dxyz_dabc[4];
z = start_position_z + a_index * dxyz_dabc[2] +
b_index * dxyz_dabc[5];
} else {
z = start_position_z;
}
for (c_index = 0, gaussian_z = gaussian_starting_point,
recursion_factor_c = recursion_factor_c_start;
c_index < c_upper; c_index++,
gaussian_z *= recursion_factor_c * recursion_factor_ac_pow_a *
recursion_factor_bc_pow_b,
recursion_factor_c *= exp_dc_squared) {

const double r_i = sqrt(distance_squared(x - i_x, y - i_y, z - i_z));
const double r_j = sqrt(distance_squared(x - j_x, y - j_y, z - j_z));
const double r_p =
sqrt(distance_squared(x - pair_x, y - pair_y, z - pair_z));

const double approxmate_polynomial =
approximate_polynomial_value<double, i_angular, j_angular>(
r_i, r_j, r_p, derivative_order);

const double gaussian = gaussian_x * gaussian_y * gaussian_z;

const double approximate_value =
abs(4.0 * M_PI * r_p * r_p * pair_prefactor *
approxmate_polynomial * gaussian);

max_gaussian_value = max(max_gaussian_value, approximate_value);

if constexpr (is_non_orthogonal) {
x += dxyz_dabc[6];
y += dxyz_dabc[7];
z += dxyz_dabc[8];
} else {
z += dxyz_dabc[8];
}
}

if constexpr (is_non_orthogonal) {
recursion_factor_bc_pow_b *= exp_dbdc;
} else {
y += dxyz_dabc[4];
}
}

if constexpr (is_non_orthogonal) {
recursion_factor_ab_pow_a *= exp_dadb;
recursion_factor_ac_pow_a *= exp_dadc;
} else {
x += dxyz_dabc[0];
}
}

if (max_gaussian_value < threshold && is_valid_pair) {
sorted_pairs_per_local_grid[i_pair_index] = -1;
atomicAdd(n_pairs_per_local_grid + block_index, -1);
}
}
}

#define tailor_gaussian_pairs_kernel_macro(li, lj) \
tailor_gaussian_pairs_kernel<li, lj, is_non_orthogonal> \
<<<block_grid, block_size>>>( \
sorted_pairs_per_local_grid, n_pairs_per_local_grid, \
non_trivial_pairs, i_shells, j_shells, n_j_shells, \
shell_to_ao_indices, accumulated_n_pairs_per_local_grid, \
sorted_block_index, image_indices, vectors_to_neighboring_images, \
n_images, mesh_a, mesh_b, mesh_c, atm, bas, env, threshold, \
derivative_order);

#define tailor_gaussian_pairs_kernel_case_macro(li, lj) \
case (li * 10 + lj): \
tailor_gaussian_pairs_kernel_macro(li, lj); \
break

template <bool is_non_orthogonal>
int tailor_gaussian_pairs_driver(
int *sorted_pairs_per_local_grid, int *n_pairs_per_local_grid,
const int i_angular, const int j_angular, const int *non_trivial_pairs,
const int *i_shells, const int *j_shells, const int n_j_shells,
const int *shell_to_ao_indices,
const int *accumulated_n_pairs_per_local_grid,
const int *sorted_block_index, const int n_contributing_blocks,
const int *image_indices, const double *vectors_to_neighboring_images,
const int n_images, const int *mesh, const int *atm, const int *bas,
const double *env, const double threshold, const int derivative_order) {
dim3 block_size(BLOCK_DIM_XYZ, BLOCK_DIM_XYZ, BLOCK_DIM_XYZ);
int mesh_a = mesh[0];
int mesh_b = mesh[1];
int mesh_c = mesh[2];
dim3 block_grid(n_contributing_blocks, 1, 1);
switch (i_angular * 10 + j_angular) {
tailor_gaussian_pairs_kernel_case_macro(0, 0);
tailor_gaussian_pairs_kernel_case_macro(0, 1);
tailor_gaussian_pairs_kernel_case_macro(0, 2);
tailor_gaussian_pairs_kernel_case_macro(0, 3);
tailor_gaussian_pairs_kernel_case_macro(0, 4);
tailor_gaussian_pairs_kernel_case_macro(1, 0);
tailor_gaussian_pairs_kernel_case_macro(1, 1);
tailor_gaussian_pairs_kernel_case_macro(1, 2);
tailor_gaussian_pairs_kernel_case_macro(1, 3);
tailor_gaussian_pairs_kernel_case_macro(1, 4);
tailor_gaussian_pairs_kernel_case_macro(2, 0);
tailor_gaussian_pairs_kernel_case_macro(2, 1);
tailor_gaussian_pairs_kernel_case_macro(2, 2);
tailor_gaussian_pairs_kernel_case_macro(2, 3);
tailor_gaussian_pairs_kernel_case_macro(2, 4);
tailor_gaussian_pairs_kernel_case_macro(3, 0);
tailor_gaussian_pairs_kernel_case_macro(3, 1);
tailor_gaussian_pairs_kernel_case_macro(3, 2);
tailor_gaussian_pairs_kernel_case_macro(3, 3);
tailor_gaussian_pairs_kernel_case_macro(3, 4);
tailor_gaussian_pairs_kernel_case_macro(4, 0);
tailor_gaussian_pairs_kernel_case_macro(4, 1);
tailor_gaussian_pairs_kernel_case_macro(4, 2);
tailor_gaussian_pairs_kernel_case_macro(4, 3);
tailor_gaussian_pairs_kernel_case_macro(4, 4);
default:
fprintf(stderr,
"angular momentum pair %d, %d is not supported in "
"evaluate_density_driver\n",
i_angular, j_angular);
return 1;
}

return checkCudaErrors(cudaPeekAtLastError());
}

} // namespace gpu4pyscf::gpbc::multi_grid
30 changes: 30 additions & 0 deletions gpu4pyscf/lib/multigrid/multigrid_v2/utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,36 @@ __host__ __device__ T distance_squared(const T x, const T y, const T z) {
return x * x + y * y + z * z;
}

template <typename T, int i_angular, int j_angular>
__host__ __device__ T approximate_polynomial_value(const double r_i,
const double r_j,
const double r_p,
const int derivative_order) {

T result = pow(r_i, i_angular) * pow(r_j, j_angular);

if (derivative_order > 0) {
result *= 2.0 * r_p;
if constexpr (i_angular > 0) {
result += i_angular * pow(r_i, i_angular - 1) * pow(r_j, j_angular);
}

if constexpr (j_angular > 0) {
result += j_angular * pow(r_i, i_angular) * pow(r_j, j_angular - 1);
}
}

if (derivative_order > 1) {
result *= 2.0 * r_p;
if constexpr (i_angular > 0 && j_angular > 0) {
result += i_angular * j_angular * pow(r_i, i_angular - 1) *
pow(r_j, j_angular - 1);
}
}

return result;
}

template <typename T, int ANG> __device__ constexpr T common_fac_sp() {
if constexpr (ANG == 0) {
return 0.282094791773878143;
Expand Down
Loading
Loading