diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c index 67f3d7ac7c..33b07a3087 100644 --- a/backends/cuda-shared/ceed-cuda-shared-basis.c +++ b/backends/cuda-shared/ceed-cuda-shared-basis.c @@ -64,7 +64,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, if (dim == 1) { CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); // avoid >512 total threads - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -77,7 +77,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; // elems_per_block must be at least 1 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -88,7 +88,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, } } else if (dim == 3) { CeedInt elems_per_block = 1; - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -115,7 +115,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, if (dim == 1) { CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); // avoid >512 total threads - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -128,7 +128,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; // elems_per_block must be at least 1 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -139,7 +139,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, } } else if (dim == 3) { CeedInt elems_per_block = 1; - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -159,19 +159,19 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; if (dim == 1) { const CeedInt elems_per_block = block_size / Q_1d; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args)); } else if (dim == 2) { const CeedInt opt_elems = block_size / (Q_1d * Q_1d); const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); } else if (dim == 3) { const CeedInt opt_elems = block_size / (Q_1d * Q_1d); const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); } @@ -211,9 +211,9 @@ static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt nu static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; - CeedInt Q_1d, dim, max_num_points = num_points[0]; - const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; - const int max_block_size = 32; + Ceed_Cuda *ceed_Cuda; + CeedInt Q_1d, dim, num_comp, max_num_points = num_points[0]; + const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; const CeedScalar *d_x, *d_u; CeedScalar *d_v; CeedBasis_Cuda_shared *data; @@ -221,6 +221,7 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad CeedCallBackend(CeedBasisGetData(basis, &data)); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); // Weight handled separately if (eval_mode == CEED_EVAL_WEIGHT) { @@ -229,14 +230,13 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad } CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); // Check padded to uniform number of points per elem for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); { - CeedInt num_comp, q_comp; + CeedInt q_comp; CeedSize len, len_required; - - CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); CeedCallBackend(CeedVectorGetLength(is_transpose ? 
u : v, &len)); len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; @@ -285,15 +285,14 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad } // -- Compile kernels - const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h>\n"; + const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h>\n"; CeedInt num_comp; if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", - Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, - "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", - max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); + CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", + CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), + "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); @@ -323,17 +322,76 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad // Basis action switch (eval_mode) { case CEED_EVAL_INTERP: { - void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + CeedInt P_1d, Q_1d; - CeedCallBackend( - CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args)); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + + CeedCallBackend(CeedInit_CudaInterp(data->d_chebyshev_interp_1d, P_1d, Q_1d, &data->c_B)); + void *interp_args[] = {(void *)&num_elem, &data->c_B, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + + if (dim == 1) { + CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, + 1)); // avoid >512 total threads + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1, + elems_per_block, shared_mem, interp_args)); + } else if (dim == 2) { + const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; + // elems_per_block must be at least 1 + CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ?
data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, + thread_1d, elems_per_block, shared_mem, interp_args)); + } else if (dim == 3) { + CeedInt elems_per_block = 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, + thread_1d, elems_per_block, shared_mem, interp_args)); + } } break; case CEED_EVAL_GRAD: { - void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + CeedInt P_1d, Q_1d; + + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + + CeedCallBackend(CeedInit_CudaInterp(data->d_chebyshev_interp_1d, P_1d, Q_1d, &data->c_B)); + void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + + if (dim == 1) { + CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, + 1)); // avoid >512 total threads + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1, + elems_per_block, shared_mem, grad_args)); + } else if (dim == 2) { + const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; + // elems_per_block must be at least 1 + CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); + } else if (dim == 3) { + CeedInt elems_per_block = 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); + } } break; case CEED_EVAL_WEIGHT: case CEED_EVAL_NONE: /* handled separately below */ diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c index b08d1fa271..3926623a21 100644 --- a/backends/hip-shared/ceed-hip-shared-basis.c +++ b/backends/hip-shared/ceed-hip-shared-basis.c @@ -123,7 +123,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, if (dim == 1) { CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; elems_per_block = elems_per_block > 0 ? elems_per_block : 1; - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -135,7 +135,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, } else if (dim == 2) { // Check if required threads is small enough to do multiple elems const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -146,7 +146,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, } } else if (dim == 3) { const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -173,7 +173,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, if (dim == 1) { CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; elems_per_block = elems_per_block > 0 ? elems_per_block : 1; - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -185,7 +185,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, } else if (dim == 2) { // Check if required threads is small enough to do multiple elems const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -196,7 +196,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, } } else if (dim == 3) { const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -218,19 +218,19 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, if (dim == 1) { const CeedInt opt_elems = block_size / Q_1d; const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args)); } else if (dim == 2) { const CeedInt opt_elems = block_size / (Q_1d * Q_1d); const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); } else if (dim == 3) { const CeedInt opt_elems = block_size / (Q_1d * Q_1d); const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); } @@ -271,8 +271,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; - const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; - const int max_block_size = 32; + const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; const CeedScalar *d_x, *d_u; CeedScalar *d_v; CeedBasis_Hip_shared *data; @@ -344,15 +343,15 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add } // -- Compile kernels - const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h>\n"; + const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h>\n"; CeedInt num_comp; if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", - Q_1d * CeedIntPow(Q_1d > P_1d ?
Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, - "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", - max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); + CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", + CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), + "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points, "BASIS_INTERP_BLOCK_SIZE", + data->block_sizes[0])); CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); @@ -382,17 +381,72 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add // Basis action switch (eval_mode) { case CEED_EVAL_INTERP: { - void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + CeedInt P_1d, Q_1d; + CeedInt block_size = data->block_sizes[0]; + + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + + if (dim == 1) { + CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); - CeedCallBackend( - CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1, + elems_per_block, shared_mem, interp_args)); + } else if (dim == 2) { + // Check if required threads is small enough to do multiple elems + const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, + thread_1d, elems_per_block, shared_mem, interp_args)); + } else if (dim == 3) { + const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? 
data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, + thread_1d, elems_per_block, shared_mem, interp_args)); + } } break; case CEED_EVAL_GRAD: { - void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + CeedInt P_1d, Q_1d; + CeedInt block_size = data->block_sizes[0]; + + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + + if (dim == 1) { + CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1, + elems_per_block, shared_mem, grad_args)); + } else if (dim == 2) { + // Check if required threads is small enough to do multiple elems + const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); + } else if (dim == 3) { + const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); + } } break; case CEED_EVAL_WEIGHT: case CEED_EVAL_NONE: /* handled separately below */ diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h new file mode 100644 index 0000000000..35681df470 --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h @@ -0,0 +1,568 @@ +// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
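Across both backends the grid size is now computed as num_elem / elems_per_block + (num_elem % elems_per_block > 0), which is ordinary ceiling division written without the multiply-back of the old form. A minimal standalone check, assuming only that CeedInt is a plain integer type; the GridOld/GridNew helpers are illustrative, not part of either backend:

  #include <cassert>

  // Old and new grid-size expressions: both compute ceil(num_elem / elems_per_block)
  static int GridOld(int num_elem, int elems_per_block) {
    return num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
  }

  static int GridNew(int num_elem, int elems_per_block) {
    return num_elem / elems_per_block + (num_elem % elems_per_block > 0);
  }

  int main() {
    // Exhaustive check over a small range; the two forms agree for every
    // non-negative element count and positive block size
    for (int e = 0; e <= 2048; e++) {
      for (int b = 1; b <= 64; b++) assert(GridOld(e, b) == GridNew(e, b));
    }
    return 0;
  }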
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA shared memory tensor product basis AtPoints templates +#include <ceed/types.h> + +//------------------------------------------------------------------------------ +// Chebyshev values +//------------------------------------------------------------------------------ +template <int Q_1D> +inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { + chebyshev_x[0] = 1.0; + chebyshev_x[1] = 2 * x; + for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; +} + +template <int Q_1D> +inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { + CeedScalar chebyshev_x[3]; + + chebyshev_x[1] = 1.0; + chebyshev_x[2] = 2 * x; + chebyshev_dx[0] = 0.0; + chebyshev_dx[1] = 2.0; + for (CeedInt i = 2; i < Q_1D; i++) { + chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; + chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; + } +} + +//------------------------------------------------------------------------------ +// 1D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 1D interpolate to points +//------------------------------------------------------------------------------ +template <int NUM_COMP, int NUM_POINTS, int Q_1D> +inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + CeedScalar chebyshev_x[Q_1D]; + + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Load coefficients + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; + __syncthreads(); + // Contract x direction + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * data.slice[i]; + } + } +} + +//------------------------------------------------------------------------------ +// 1D interpolate transpose +//------------------------------------------------------------------------------ +template <int NUM_COMP, int NUM_POINTS, int Q_1D> +inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Clear shared memory + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; + __syncthreads(); + // Contract x direction + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p]; + } +} + +//------------------------------------------------------------------------------ +// 1D derivatives at points +//------------------------------------------------------------------------------ +template <int NUM_COMP, int NUM_POINTS, int Q_1D> +inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < NUM_COMP; i++)
r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Load coefficients + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; + __syncthreads(); + // Contract x direction + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * data.slice[i]; + } + } +} + +//------------------------------------------------------------------------------ +// 1D derivatives transpose +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Clear shared memory + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; + __syncthreads(); + // Contract x direction + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p]; + } +} + +//------------------------------------------------------------------------------ +// 2D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 2D interpolate to points +//------------------------------------------------------------------------------ +template +inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Load coefficients + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; + __syncthreads(); + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = 0.0; + for (CeedInt j = 0; j < Q_1D; j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + } + } + // Contract y direction + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * buffer[i]; + } + } +} + +//------------------------------------------------------------------------------ +// 2D interpolate transpose +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; + __syncthreads(); + // Contract y direction + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_U[comp]; + } + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: shifting to avoid atomic adds + const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + + for 
(CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = (j + p) % Q_1D; + + atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } +} + +//------------------------------------------------------------------------------ +// 2D derivatives at points +//------------------------------------------------------------------------------ +template +inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Load coefficients + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; + __syncthreads(); + for (CeedInt dim = 0; dim < 2; dim++) { + // Contract x direction + if (dim == 0) ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = 0.0; + for (CeedInt j = 0; j < Q_1D; j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + } + } + // Contract y direction + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i]; + } + } + } +} + +//------------------------------------------------------------------------------ +// 2D derivatives transpose +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; + __syncthreads(); + for (CeedInt dim = 0; dim < 2; dim++) { + // Contract y direction + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP]; + } + // Contract x direction + if (dim == 0) ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: shifting to avoid atomic adds + const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + + for (CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = (j + p) % Q_1D; + + atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } +} + +//------------------------------------------------------------------------------ +// 3D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 3D interpolate to points 
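The ChebyshevPolynomialsAtPoint recurrence above (1, 2x, then 2x * c[i-1] - c[i-2]) generates Chebyshev polynomials of the second kind, U_n(x). A host-side sketch checking the recurrence against the closed form U_n(cos t) = sin((n + 1) t) / sin(t), with a stand-in value for the JiT-time Q_1D:

  #include <cassert>
  #include <cmath>

  int main() {
    const int Q_1D = 6;  // stand-in for the JiT-compiled BASIS_Q_1D
    const double x = 0.37, t = std::acos(x);
    double u[Q_1D];

    // Same recurrence as ChebyshevPolynomialsAtPoint
    u[0] = 1.0;
    u[1] = 2 * x;
    for (int i = 2; i < Q_1D; i++) u[i] = 2 * x * u[i - 1] - u[i - 2];
    // Closed form for |x| < 1: U_n(cos t) = sin((n + 1) t) / sin(t)
    for (int n = 0; n < Q_1D; n++) assert(std::fabs(u[n] - std::sin((n + 1) * t) / std::sin(t)) < 1e-9);
    return 0;
  }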
+//------------------------------------------------------------------------------ +template +inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Load coefficients + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; + __syncthreads(); + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = 0.0; + for (CeedInt j = 0; j < Q_1D; j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + } + } + // Contract y and z direction + ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * buffer[i] * z; + } + } + } +} + +//------------------------------------------------------------------------------ +// 3D interpolate transpose +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; + __syncthreads(); + // Contract y and z direction + ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_U[comp] * z; + } + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: shifting to avoid atomic adds + const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + + for (CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = ((j + p) % Q_1D); + + atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } + } +} + +//------------------------------------------------------------------------------ +// 3D derivatives at points +//------------------------------------------------------------------------------ +template +inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Load coefficients + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; + __syncthreads(); + for (CeedInt dim = 0; dim < 3; dim++) { + // Contract x direction + if (dim == 0) ChebyshevDerivativeAtPoint(r_X[0], 
chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = 0.0; + for (CeedInt j = 0; j < Q_1D; j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + } + } + // Contract y and z direction + if (dim == 2) ChebyshevDerivativeAtPoint(r_X[2], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z; + } + } + } + } +} + +//------------------------------------------------------------------------------ +// 3D derivatives transpose +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; + __syncthreads(); + for (CeedInt dim = 0; dim < 3; dim++) { + // Contract y and z direction + if (dim == 2) ChebyshevDerivativeAtPoint(r_X[2], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z; + } + // Contract x direction + if (dim == 0) ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: shifting to avoid atomic adds + const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + + for (CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = ((j + p) % Q_1D); + + atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } + } +} + +//------------------------------------------------------------------------------ +// Loops over points +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Interpolate to points +//------------------------------------------------------------------------------ +template +inline __device__ void InterpAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C, + const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) { + const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % NUM_PTS; + CeedScalar r_X[DIM]; + + for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p]; + if (DIM == 1) { + InterpAtPoints1d(data, i, r_C, r_X, r_V); + } else if (DIM == 2) { + InterpAtPoints2d(data, i, r_C, r_X, 
r_V); + } else if (DIM == 3) { + InterpAtPoints3d(data, i, r_C, r_X, r_V); + } + if (i < NUM_PTS) { + for (CeedInt j = 0; j < NUM_COMP; j++) d_V[comp_stride * j + p] = r_V[j]; + } + } +} + +//------------------------------------------------------------------------------ +// Interpolate from points +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedInt points_per_elem, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X, + CeedScalar *__restrict__ r_C) { + // Clear register + for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0; + + const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % NUM_PTS; + CeedScalar r_X[DIM]; + + for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p]; + for (CeedInt j = 0; j < NUM_COMP; j++) { + if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p]; + else r_U[j] = 0.0; + } + if (BASIS_DIM == 1) { + InterpTransposeAtPoints1d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 2) { + InterpTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + InterpTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + __syncthreads(); +} + +//------------------------------------------------------------------------------ +// Gradient at points +//------------------------------------------------------------------------------ +template +inline __device__ void GradAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C, + const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) { + const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % NUM_PTS; + CeedScalar r_X[DIM]; + + for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p]; + if (DIM == 1) { + GradAtPoints1d(data, i, r_C, r_X, r_V); + } else if (DIM == 2) { + GradAtPoints2d(data, i, r_C, r_X, r_V); + } else if (DIM == 3) { + GradAtPoints3d(data, i, r_C, r_X, r_V); + } + if (i < NUM_PTS) { + for (CeedInt j = 0; j < NUM_COMP * DIM; j++) d_V[comp_stride * j + p] = r_V[j]; + } + } +} + +//------------------------------------------------------------------------------ +// Grad from points +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedInt points_per_elem, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X, + CeedScalar *__restrict__ r_C) { + // Clear register + for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? 
Q_1D : 1); i++) r_C[i] = 0.0; + + const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % NUM_PTS; + CeedScalar r_X[DIM]; + + for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p]; + for (CeedInt j = 0; j < NUM_COMP * DIM; j++) { + if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p]; + else r_U[j] = 0.0; + } + if (BASIS_DIM == 1) { + GradTransposeAtPoints1d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 2) { + GradTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + GradTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + __syncthreads(); +} diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h new file mode 100644 index 0000000000..cfc6899476 --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h @@ -0,0 +1,170 @@ +// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA tensor product basis with AtPoints evaluation +#include + +#include "cuda-shared-basis-read-write-templates.h" +#include "cuda-shared-basis-tensor-at-points-templates.h" +#include "cuda-shared-basis-tensor-templates.h" + +//------------------------------------------------------------------------------ +// Tensor Basis Kernels AtPoints +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Interp +//------------------------------------------------------------------------------ +extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, + const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP]; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Map to coefficients + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + Interp1d(data, r_U, c_B, r_C); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor2d(data, r_U, c_B, r_C); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor3d(data, r_U, c_B, r_C); + } + + // Map to points + InterpAtPoints(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V, + &d_V[elem * BASIS_NUM_PTS]); + } +} + +extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Map from points + InterpTransposeAtPoints(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem], + &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C); + + // Map from coefficients + if (BASIS_DIM == 1) { + InterpTranspose1d(data, r_C, c_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + InterpTransposeTensor2d(data, r_C, c_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + InterpTransposeTensor3d(data, r_C, c_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + +//------------------------------------------------------------------------------ +// Grad +//------------------------------------------------------------------------------ +extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, + const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM]; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Map to coefficients + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + Interp1d(data, r_U, c_B, r_C); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor2d(data, r_U, c_B, r_C); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor3d(data, r_U, c_B, r_C); + } + + // Map to points + GradAtPoints(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V, + &d_V[elem * BASIS_NUM_PTS]); + } +} + +extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Map from points + GradTransposeAtPoints(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem], + &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C); + + // Map from coefficients + if (BASIS_DIM == 1) { + InterpTranspose1d(data, r_C, c_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + InterpTransposeTensor2d(data, r_C, c_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + InterpTransposeTensor3d(data, r_C, c_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h new file mode 100644 index 0000000000..7844810c2d --- /dev/null +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h @@ -0,0 +1,568 @@ +// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
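The kernels above all share one launch contract: thread_1d = max(P_1d, Q_1d) threads in x (and in y for dim > 1), one element per z-slot, dynamic shared memory holding one slice per z-slot, and a stride loop over gridDim.x * blockDim.z covering the elements. A reduced sketch of just that indexing, as a hypothetical kernel rather than the real basis code; the guard-inside-loop layout is chosen here so every __syncthreads() is reached block-uniformly even for the tail block:

  #define T_1D 8  // stand-in for the JiT-time T_1D = CeedIntMax(Q_1d, P_1d)

  extern "C" __global__ void SliceIndexingSketch(const int num_elem, double *d_out) {
    extern __shared__ double slice[];
    // One T_1D x T_1D slice per element slot in the block, as in data.slice
    double *my_slice = slice + threadIdx.z * T_1D * T_1D;

    for (int base = blockIdx.x * blockDim.z; base < num_elem; base += gridDim.x * blockDim.z) {
      const int elem = base + threadIdx.z;

      if (elem < num_elem) my_slice[threadIdx.x + threadIdx.y * T_1D] = (double)elem;  // stand-in for basis work
      __syncthreads();
      if (elem < num_elem && threadIdx.x == 0 && threadIdx.y == 0) d_out[elem] = my_slice[0];
      __syncthreads();
    }
  }

A launch matching the host code above would be <<<grid, dim3(T_1D, T_1D, elems_per_block), elems_per_block * T_1D * T_1D * sizeof(double)>>>, with grid from the ceiling division shown earlier.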
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP shared memory tensor product basis AtPoints templates +#include <ceed/types.h> + +//------------------------------------------------------------------------------ +// Chebyshev values +//------------------------------------------------------------------------------ +template <int Q_1D> +inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { + chebyshev_x[0] = 1.0; + chebyshev_x[1] = 2 * x; + for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; +} + +template <int Q_1D> +inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { + CeedScalar chebyshev_x[3]; + + chebyshev_x[1] = 1.0; + chebyshev_x[2] = 2 * x; + chebyshev_dx[0] = 0.0; + chebyshev_dx[1] = 2.0; + for (CeedInt i = 2; i < Q_1D; i++) { + chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; + chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; + } +} + +//------------------------------------------------------------------------------ +// 1D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 1D interpolate to points +//------------------------------------------------------------------------------ +template <int NUM_COMP, int NUM_POINTS, int Q_1D> +inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, + CeedScalar *__restrict__ r_V) { + CeedScalar chebyshev_x[Q_1D]; + + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Load coefficients + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; + __syncthreads(); + // Contract x direction + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * data.slice[i]; + } + } +} + +//------------------------------------------------------------------------------ +// 1D interpolate transpose +//------------------------------------------------------------------------------ +template <int NUM_COMP, int NUM_POINTS, int Q_1D> +inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, + const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Clear shared memory + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; + __syncthreads(); + // Contract x direction + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p]; + } +} + +//------------------------------------------------------------------------------ +// 1D derivatives at points +//------------------------------------------------------------------------------ +template <int NUM_COMP, int NUM_POINTS, int Q_1D> +inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, + CeedScalar *__restrict__ r_V) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); + for
+
+//------------------------------------------------------------------------------
+// 1D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 1D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Load coefficients
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * data.slice[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Clear shared memory
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
+    __syncthreads();
+    // Contract x direction
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Load coefficients
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * data.slice[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Clear shared memory
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
+    __syncthreads();
+    // Contract x direction
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p];
+  }
+}
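
The "(i + p) % Q_1D" pattern in the transpose helpers above rotates each point's visiting order over the shared-memory slots, so threads that run in lockstep tend to hit different slots and the atomicAdd traffic is spread out rather than serialized on one address. A hypothetical, simplified 1D illustration (names are illustrative; requires hardware double atomicAdd):

    __device__ void scatter_shifted(int Q, int p, double value, const double *basis, double *slice) {
      for (int i = 0; i < Q; i++) {
        // Rotate the visiting order by the point index: one full pass over all Q slots,
        // but each point starts at a different slot
        const int ii = (i + p) % Q;
        atomicAdd(&slice[ii], basis[ii] * value);
      }
    }
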
+
+//------------------------------------------------------------------------------
+// 2D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 2D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Load coefficients
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      buffer[i] = 0.0;
+      for (CeedInt j = 0; j < Q_1D; j++) {
+        buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+      }
+    }
+    // Contract y direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * buffer[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Clear shared memory
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    __syncthreads();
+    // Contract y direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      buffer[i] = chebyshev_x[i] * r_U[comp];
+    }
+    // Contract x direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        // Note: shifting to avoid atomic adds
+        const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          const CeedInt jj = (j + p) % Q_1D;
+
+          atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+        }
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Load coefficients
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    __syncthreads();
+    for (CeedInt dim = 0; dim < 2; dim++) {
+      // Contract x direction
+      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = 0.0;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+        }
+      }
+      // Contract y direction
+      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Clear shared memory
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    __syncthreads();
+    for (CeedInt dim = 0; dim < 2; dim++) {
+      // Contract y direction
+      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
+      }
+      // Contract x direction
+      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      if (p < NUM_POINTS) {
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          // Note: shifting to avoid atomic adds
+          const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            const CeedInt jj = (j + p) % Q_1D;
+
+            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+          }
+        }
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+  }
+}
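
Per point, the 2D helpers above perform a standard tensor contraction: with coefficients laid out x-fastest, the value at (x, y) is sum_j U_j(y) * sum_i U_i(x) * c[i + j * Q]. A serial reference (hypothetical, names illustrative) of what one thread computes for its point:

    double interp_at_point_2d(int Q, const double *c, const double *ux, const double *uy) {
      double v = 0.0;
      for (int j = 0; j < Q; j++) {
        // Contract x for row j, then fold the row into the y contraction
        double row = 0.0;
        for (int i = 0; i < Q; i++) row += ux[i] * c[i + j * Q];
        v += uy[j] * row;
      }
      return v;
    }
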
+
+//------------------------------------------------------------------------------
+// 3D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 3D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Load coefficients
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
+      __syncthreads();
+      // Contract x direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = 0.0;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+        }
+      }
+      // Contract y and z direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+      const CeedScalar z = chebyshev_x[k];
+
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp] += chebyshev_x[i] * buffer[i] * z;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Clear shared memory
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+      __syncthreads();
+      // Contract y and z direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+      const CeedScalar z = chebyshev_x[k];
+
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = chebyshev_x[i] * r_U[comp] * z;
+      }
+      // Contract x direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      if (p < NUM_POINTS) {
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          // Note: shifting to avoid atomic adds
+          const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            const CeedInt jj = ((j + p) % Q_1D);
+
+            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+          }
+        }
+      }
+      // Pull from shared to register
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+    }
+  }
+}
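
The 3D helpers above work slice by slice: only one Q x Q slab fits in shared memory per z-plane of the block, so the k loop stages each slice, contracts it in x and y, and scales the result by the z basis value U_k(z). A serial reference (hypothetical, names illustrative):

    double interp_at_point_3d(int Q, const double *c, const double *ux, const double *uy, const double *uz) {
      double v = 0.0;
      for (int k = 0; k < Q; k++) {
        // Contract the k-th Q x Q slice in x and y, then weight by the z factor
        double slab = 0.0;
        for (int j = 0; j < Q; j++) {
          double row = 0.0;
          for (int i = 0; i < Q; i++) row += ux[i] * c[i + Q * (j + Q * k)];
          slab += uy[j] * row;
        }
        v += uz[k] * slab;
      }
      return v;
    }
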
+
+//------------------------------------------------------------------------------
+// 3D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Load coefficients
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
+      __syncthreads();
+      for (CeedInt dim = 0; dim < 3; dim++) {
+        // Contract x direction
+        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          buffer[i] = 0.0;
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+          }
+        }
+        // Contract y and z direction
+        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        const CeedScalar z = chebyshev_x[k];
+
+        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
+        }
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Clear shared memory
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+      __syncthreads();
+      for (CeedInt dim = 0; dim < 3; dim++) {
+        // Contract y and z direction
+        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        const CeedScalar z = chebyshev_x[k];
+
+        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
+        }
+        // Contract x direction
+        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        if (p < NUM_POINTS) {
+          for (CeedInt i = 0; i < Q_1D; i++) {
+            // Note: shifting to avoid atomic adds
+            const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+            for (CeedInt j = 0; j < Q_1D; j++) {
+              const CeedInt jj = ((j + p) % Q_1D);
+
+              atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+            }
+          }
+        }
+      }
+      // Pull from shared to register
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+    }
+  }
+}
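
The dim loop in the gradient helpers is the usual tensor-product gradient trick: component d reuses the interpolation contraction but substitutes the derivative table for the basis table in direction d only. A hypothetical serial sketch in 2D (names illustrative):

    void grad_at_point_2d(int Q, const double *c, const double *ux, const double *dux, const double *uy, const double *duy, double *grad) {
      for (int dim = 0; dim < 2; dim++) {
        // Derivative factor in exactly one direction per pass
        const double *bx = (dim == 0) ? dux : ux;
        const double *by = (dim == 1) ? duy : uy;
        double v = 0.0;
        for (int j = 0; j < Q; j++) {
          double row = 0.0;
          for (int i = 0; i < Q; i++) row += bx[i] * c[i + j * Q];
          v += by[j] * row;
        }
        grad[dim] = v;
      }
    }
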
+
+//------------------------------------------------------------------------------
+// Loops over points
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Interpolate to points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void InterpAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
+                                      const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar r_X[DIM];
+
+    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    if (DIM == 1) {
+      InterpAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 2) {
+      InterpAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 3) {
+      InterpAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    }
+    if (i < NUM_PTS) {
+      for (CeedInt j = 0; j < NUM_COMP; j++) d_V[comp_stride * j + p] = r_V[j];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Interpolate from points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedInt points_per_elem,
+                                               const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
+                                               CeedScalar *__restrict__ r_C) {
+  // Clear register
+  for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
+
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar r_X[DIM];
+
+    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    for (CeedInt j = 0; j < NUM_COMP; j++) {
+      if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
+      else r_U[j] = 0.0;
+    }
+    if (DIM == 1) {
+      InterpTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (DIM == 2) {
+      InterpTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (DIM == 3) {
+      InterpTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// Gradient at points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void GradAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
+                                    const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar r_X[DIM];
+
+    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    if (DIM == 1) {
+      GradAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 2) {
+      GradAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 3) {
+      GradAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    }
+    if (i < NUM_PTS) {
+      for (CeedInt j = 0; j < NUM_COMP * DIM; j++) d_V[comp_stride * j + p] = r_V[j];
+    }
+  }
+}
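
The point loops above iterate to a padded bound rather than to NUM_PTS: every thread executes the same number of iterations, so the __syncthreads() calls inside the per-point helpers are reached by the whole block, while the actual work is masked by comparisons against the true point count. A hypothetical illustration (names mine; an integer ceiling is shown in place of the floating-point ceil used in the patch):

    __device__ void visit_points_padded(int num_pts, double *out) {
      const int stride = blockDim.x * blockDim.y;
      const int bound  = stride * ((num_pts + stride - 1) / stride);  // integer ceil
      for (int i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += stride) {
        // Collective work (including barriers) happens here for every i...
        if (i < num_pts) out[i] = 1.0;  // ...but only real points produce results
      }
    }
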
+
+//------------------------------------------------------------------------------
+// Grad from points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedInt points_per_elem,
+                                             const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
+                                             CeedScalar *__restrict__ r_C) {
+  // Clear register
+  for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
+
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar r_X[DIM];
+
+    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    for (CeedInt j = 0; j < NUM_COMP * DIM; j++) {
+      if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
+      else r_U[j] = 0.0;
+    }
+    if (DIM == 1) {
+      GradTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (DIM == 2) {
+      GradTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (DIM == 3) {
+      GradTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    }
+  }
+  __syncthreads();
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
new file mode 100644
index 0000000000..355f53d0f4
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
@@ -0,0 +1,192 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed
+
+/// @file
+/// Internal header for HIP tensor product basis with AtPoints evaluation
+#include <ceed/types.h>
+
+#include "hip-shared-basis-read-write-templates.h"
+#include "hip-shared-basis-tensor-at-points-templates.h"
+#include "hip-shared-basis-tensor-templates.h"
+
+//------------------------------------------------------------------------------
+// Tensor Basis Kernels AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Interp
+//------------------------------------------------------------------------------
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
+                        const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
+  __syncthreads();
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map to coefficients
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    }
+
+    // Map to points
+    InterpAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
+                                                                         &d_V[elem * BASIS_NUM_PTS]);
+  }
+}
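
The kernels stage the Chebyshev coefficient matrix in shared memory once per block before the element loop. A sketch (hypothetical, assuming loadMatrix is a block-strided copy as its call here suggests) of that cooperative load pattern; a barrier must follow before the matrix is read:

    template <int SIZE>
    __device__ void load_matrix_sketch(const double *__restrict__ src, double *dst) {
      // Each thread copies every (block size)-th entry, so one pass over the
      // linearized P x Q matrix moves it all into shared memory
      for (int i = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); i < SIZE;
           i += blockDim.x * blockDim.y * blockDim.z)
        dst[i] = src[i];
    }
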
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
+                                 const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
+  __syncthreads();
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map from points
+    InterpTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
+                                                                                  &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad
+//------------------------------------------------------------------------------
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void GradAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
+                      const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
+  __syncthreads();
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map to coefficients
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    }
+
+    // Map to points
+    GradAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
+                                                                       &d_V[elem * BASIS_NUM_PTS]);
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
+                               const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
+  __syncthreads();
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map from points
+    GradTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
+                                                                                &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 1cc767cb08..1879464450 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -452,8 +452,8 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
   CeedElemRestriction elem_rstr_in, elem_rstr_out, index_elem_rstr_in, index_elem_rstr_out;
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
-  CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
   CeedCall(CeedOperatorGetCeed(op, &ceed));
+  CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
 
   CeedCall(CeedOperatorGetActiveVectorLengths(op, &num_nodes_in, &num_nodes_out));
   CeedCall(CeedOperatorGetActiveElemRestrictions(op, &elem_rstr_in, &elem_rstr_out));
diff --git a/tests/t354-basis.c b/tests/t354-basis.c
index e137f8c44f..4d3402b257 100644
--- a/tests/t354-basis.c
+++ b/tests/t354-basis.c
@@ -1,6 +1,6 @@
 /// @file
-/// Test polynomial interpolation to arbitrary points in multiple dimensions
-/// \test Test polynomial interpolation to arbitrary points in multiple dimensions
+/// Test polynomial interpolation transpose to arbitrary points in multiple dimensions
+/// \test Test polynomial interpolation transpose to arbitrary points in multiple dimensions
 #include <ceed.h>
 #include <math.h>
 #include <stdio.h>
@@ -85,7 +85,7 @@ int main(int argc, char **argv) {
       CeedBasisApplyAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
       CeedVectorGetArrayRead(u_point, CEED_MEM_HOST, &u_point_array);
       for (CeedInt j = 0; j < p_dim; j++) fx += u_array[j] * u_point_array[j];
-      if (fabs(v_array[i] - fx) > 100. * CEED_EPSILON) {
+      if (fabs(v_array[i] - fx) > 500. * CEED_EPSILON) {
         // LCOV_EXCL_START
         printf("[%" CeedInt_FMT "] %f != %f = f(%f", dim, v_array[i], fx, coord[0]);
         for (CeedInt d = 1; d < dim; d++) printf(", %f", coord[d]);
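
The loosened tolerance in t354 reflects the identity the test exercises: for any matrix B and vectors u, v, (B u) . v equals u . (B^T v) in exact arithmetic, so a transpose apply can be verified through dot products; in floating point the two evaluation orders differ by accumulated rounding, hence a comparison against a modest multiple of CEED_EPSILON. A hypothetical standalone check of the identity (data values are arbitrary):

    #include <stdio.h>

    int main(void) {
      const double B[2][3] = {{1, 2, 3}, {4, 5, 6}};
      const double u[3] = {0.5, -1.0, 2.0}, v[2] = {1.5, 0.25};
      double       bu_dot_v = 0.0, u_dot_btv = 0.0;

      for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 3; j++) {
          bu_dot_v += B[i][j] * u[j] * v[i];    // v^T (B u)
          u_dot_btv += u[j] * (B[i][j] * v[i]); // u^T (B^T v)
        }
      }
      printf("%f %f\n", bu_dot_v, u_dot_btv);  // identical up to rounding
      return 0;
    }
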