minor - fix style in gen templates
jeremylt committed Dec 16, 2024
1 parent 4eda27c commit 688b547
Showing 2 changed files with 53 additions and 31 deletions.
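
The change applies one pattern throughout both files: guard and loop bodies that span their own line gain explicit braces, and the trailing contraction comments on those one-liners are dropped or moved onto their own line. Below is a minimal sketch of the braced style, using illustrative names that are not taken from the diff:

// Hypothetical helper, not part of this commit: shows the braced style applied
// to the guarded per-thread loops in the gen templates.
template <int P_1d>
__device__ void AccumulateColumn(const double *__restrict__ d_u, double *__restrict__ r_sum, int t_id_x, int t_id_y) {
  // Guard is braced even though its body is a single loop
  if (t_id_x < P_1d && t_id_y < P_1d) {
    // Loop body is braced instead of left as a trailing single-statement line
    for (int z = 0; z < P_1d; z++) {
      r_sum[z] += d_u[t_id_x + t_id_y * P_1d + z * P_1d * P_1d];
    }
  }
}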
43 changes: 27 additions & 16 deletions include/ceed/jit-source/cuda/cuda-gen-templates.h
@@ -179,13 +179,14 @@ inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt e
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
- if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+ if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
for (CeedInt z = 0; z < P_1d; z++) {
const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
const CeedInt ind = indices[node + elem * P_1d * P_1d * P_1d];

for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + COMP_STRIDE * comp];
}
+ }
}

//------------------------------------------------------------------------------
@@ -194,13 +195,14 @@ inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt n
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
CeedScalar *__restrict__ r_u) {
- if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+ if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
for (CeedInt z = 0; z < P_1d; z++) {
const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;

for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + comp * STRIDES_COMP];
}
+ }
}

//------------------------------------------------------------------------------
@@ -238,13 +240,14 @@ inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedI
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
- if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+ if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
for (CeedInt z = 0; z < P_1d; z++) {
const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
const CeedInt ind = indices[node + elem * P_1d * P_1d * P_1d];

for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]);
}
+ }
}

//------------------------------------------------------------------------------
@@ -253,13 +256,14 @@ inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
CeedScalar *__restrict__ d_v) {
- if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+ if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
for (CeedInt z = 0; z < P_1d; z++) {
const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;

for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1d];
}
+ }
}

//------------------------------------------------------------------------------
@@ -274,15 +278,19 @@ inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q,
__syncthreads();
// X derivative
r_V[comp + 0 * NUM_COMP] = 0.0;
- for (CeedInt i = 0; i < Q_1d; i++)
- r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative)
+ for (CeedInt i = 0; i < Q_1d; i++) {
+ r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];
+ }
// Y derivative
r_V[comp + 1 * NUM_COMP] = 0.0;
- for (CeedInt i = 0; i < Q_1d; i++)
- r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative)
+ for (CeedInt i = 0; i < Q_1d; i++) {
+ r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];
+ }
// Z derivative
r_V[comp + 2 * NUM_COMP] = 0.0;
- for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d]; // Contract z direction (Z derivative)
+ for (CeedInt i = 0; i < Q_1d; i++) {
+ r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];
+ }
__syncthreads();
}
}
@@ -296,21 +304,24 @@ inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const Ce
CeedScalar *__restrict__ r_V) {
if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
- // X derivative
data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
__syncthreads();
- for (CeedInt i = 0; i < Q_1d; i++)
- r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative)
+ // X derivative
+ for (CeedInt i = 0; i < Q_1d; i++) {
+ r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];
+ }
__syncthreads();
// Y derivative
data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
__syncthreads();
- for (CeedInt i = 0; i < Q_1d; i++)
- r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative)
+ for (CeedInt i = 0; i < Q_1d; i++) {
+ r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];
+ }
__syncthreads();
// Z derivative
- for (CeedInt i = 0; i < Q_1d; i++)
- r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP]; // PARTIAL contract z direction (Z derivative)
+ for (CeedInt i = 0; i < Q_1d; i++) {
+ r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];
+ }
}
}
}
41 changes: 26 additions & 15 deletions include/ceed/jit-source/hip/hip-gen-templates.h
@@ -145,27 +145,29 @@ inline __device__ void WriteLVecStrided2d(SharedData_Hip &data, const CeedInt el
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void ReadLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
- if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+ if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
for (CeedInt z = 0; z < P_1d; z++) {
const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
const CeedInt ind = indices[node + elem * P_1d * P_1d * P_1d];

for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + COMP_STRIDE * comp];
}
+ }
}

//------------------------------------------------------------------------------
// L-vector -> E-vector, strided
//------------------------------------------------------------------------------
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
- if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+ if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
for (CeedInt z = 0; z < P_1d; z++) {
const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;

for (CeedInt comp = 0; comp < NUM_COMP; comp++) r_u[z + comp * P_1d] = d_u[ind + comp * STRIDES_COMP];
}
+ }
}

//------------------------------------------------------------------------------
@@ -203,13 +205,14 @@ inline __device__ void ReadEVecSliceStrided3d(SharedData_Hip &data, const CeedIn
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
- if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+ if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
for (CeedInt z = 0; z < P_1d; z++) {
const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
const CeedInt ind = indices[node + elem * P_1d * P_1d * P_1d];

for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]);
}
+ }
}

//------------------------------------------------------------------------------
@@ -218,13 +221,14 @@ inline __device__ void WriteLVecStandard3d(SharedData_Hip &data, const CeedInt n
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Hip &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
CeedScalar *__restrict__ d_v) {
- if (data.t_id_x < P_1d && data.t_id_y < P_1d)
+ if (data.t_id_x < P_1d && data.t_id_y < P_1d) {
for (CeedInt z = 0; z < P_1d; z++) {
const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d;
const CeedInt ind = node * STRIDES_NODE + elem * STRIDES_ELEM;

for (CeedInt comp = 0; comp < NUM_COMP; comp++) d_v[ind + comp * STRIDES_COMP] += r_v[z + comp * P_1d];
}
+ }
}

//------------------------------------------------------------------------------
@@ -239,15 +243,19 @@ inline __device__ void GradColloSlice3d(SharedData_Hip &data, const CeedInt q, c
__syncthreads();
// X derivative
r_V[comp + 0 * NUM_COMP] = 0.0;
- for (CeedInt i = 0; i < Q_1d; i++)
- r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative)
+ for (CeedInt i = 0; i < Q_1d; i++) {
+ r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice[i + data.t_id_y * T_1D];
+ }
// Y derivative
r_V[comp + 1 * NUM_COMP] = 0.0;
- for (CeedInt i = 0; i < Q_1d; i++)
- r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative)
+ for (CeedInt i = 0; i < Q_1d; i++) {
+ r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice[data.t_id_x + i * T_1D];
+ }
// Z derivative
r_V[comp + 2 * NUM_COMP] = 0.0;
- for (CeedInt i = 0; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d]; // Contract z direction (Z derivative)
+ for (CeedInt i = 0; i < Q_1d; i++) {
+ r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];
+ }
__syncthreads();
}
}
@@ -264,18 +272,21 @@ inline __device__ void GradColloSliceTranspose3d(SharedData_Hip &data, const Cee
// X derivative
data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
__syncthreads();
- for (CeedInt i = 0; i < Q_1d; i++)
- r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D]; // Contract x direction (X derivative)
+ for (CeedInt i = 0; i < Q_1d; i++) {
+ r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice[i + data.t_id_y * T_1D];
+ }
__syncthreads();
// Y derivative
data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
__syncthreads();
- for (CeedInt i = 0; i < Q_1d; i++)
- r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D]; // Contract y direction (Y derivative)
+ for (CeedInt i = 0; i < Q_1d; i++) {
+ r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice[data.t_id_x + i * T_1D];
+ }
__syncthreads();
// Z derivative
- for (CeedInt i = 0; i < Q_1d; i++)
- r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP]; // PARTIAL contract z direction (Z derivative)
+ for (CeedInt i = 0; i < Q_1d; i++) {
+ r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];
+ }
}
}
}
