Skip to content

Commit

Permalink
gpu - fix AtPoints transpose shift
Browse files: browse the repository at this point in the history
  • Loading branch information
jeremylt committed Jan 10, 2025
1 parent 1a63be7 commit a24d84e
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const Ce
// Contract x direction
if (p < NUM_POINTS) {
for (CeedInt i = 0; i < Q_1D; i++) {
atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
}
}
// Pull from shared to register
Expand Down Expand Up @@ -120,7 +120,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed
// Contract x direction
if (p < NUM_POINTS) {
for (CeedInt i = 0; i < Q_1D; i++) {
atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
}
}
// Pull from shared to register
Expand Down Expand Up @@ -186,10 +186,10 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce
if (p < NUM_POINTS) {
for (CeedInt i = 0; i < Q_1D; i++) {
// Note: shifting to avoid atomic adds
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
const CeedInt ii = (i + data.t_id_x) % Q_1D;

for (CeedInt j = 0; j < Q_1D; j++) {
const CeedInt jj = (j + p) % Q_1D;
const CeedInt jj = (j + data.t_id_y) % Q_1D;

atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
}
Expand Down Expand Up @@ -261,10 +261,10 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed
if (p < NUM_POINTS) {
for (CeedInt i = 0; i < Q_1D; i++) {
// Note: shifting to avoid atomic adds
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
const CeedInt ii = (i + data.t_id_x) % Q_1D;

for (CeedInt j = 0; j < Q_1D; j++) {
const CeedInt jj = (j + p) % Q_1D;
const CeedInt jj = (j + data.t_id_y) % Q_1D;

atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
}
Expand Down Expand Up @@ -343,10 +343,10 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce
if (p < NUM_POINTS) {
for (CeedInt i = 0; i < Q_1D; i++) {
// Note: shifting to avoid atomic adds
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
const CeedInt ii = (i + data.t_id_x) % Q_1D;

for (CeedInt j = 0; j < Q_1D; j++) {
const CeedInt jj = ((j + p) % Q_1D);
const CeedInt jj = (j + data.t_id_y) % Q_1D;

atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
}
Expand Down Expand Up @@ -430,10 +430,10 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
if (p < NUM_POINTS) {
for (CeedInt i = 0; i < Q_1D; i++) {
// Note: shifting to avoid atomic adds
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
const CeedInt ii = (i + data.t_id_x) % Q_1D;

for (CeedInt j = 0; j < Q_1D; j++) {
const CeedInt jj = ((j + p) % Q_1D);
const CeedInt jj = (j + data.t_id_y) % Q_1D;

atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const Cee
// Contract x direction
if (p < NUM_POINTS) {
for (CeedInt i = 0; i < Q_1D; i++) {
atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
}
}
// Pull from shared to register
Expand Down Expand Up @@ -120,7 +120,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedI
// Contract x direction
if (p < NUM_POINTS) {
for (CeedInt i = 0; i < Q_1D; i++) {
atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]);
}
}
// Pull from shared to register
Expand Down Expand Up @@ -186,10 +186,10 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const Cee
if (p < NUM_POINTS) {
for (CeedInt i = 0; i < Q_1D; i++) {
// Note: shifting to avoid atomic adds
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
const CeedInt ii = (i + data.t_id_x) % Q_1D;

for (CeedInt j = 0; j < Q_1D; j++) {
const CeedInt jj = (j + p) % Q_1D;
const CeedInt jj = (j + data.t_id_y) % Q_1D;

atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
}
Expand Down Expand Up @@ -261,10 +261,10 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedI
if (p < NUM_POINTS) {
for (CeedInt i = 0; i < Q_1D; i++) {
// Note: shifting to avoid atomic adds
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
const CeedInt ii = (i + data.t_id_x) % Q_1D;

for (CeedInt j = 0; j < Q_1D; j++) {
const CeedInt jj = (j + p) % Q_1D;
const CeedInt jj = (j + data.t_id_y) % Q_1D;

atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
}
Expand Down Expand Up @@ -343,10 +343,10 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const Cee
if (p < NUM_POINTS) {
for (CeedInt i = 0; i < Q_1D; i++) {
// Note: shifting to avoid atomic adds
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
const CeedInt ii = (i + data.t_id_x) % Q_1D;

for (CeedInt j = 0; j < Q_1D; j++) {
const CeedInt jj = ((j + p) % Q_1D);
const CeedInt jj = (j + data.t_id_y) % Q_1D;

atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
}
Expand Down Expand Up @@ -430,10 +430,10 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI
if (p < NUM_POINTS) {
for (CeedInt i = 0; i < Q_1D; i++) {
// Note: shifting to avoid atomic adds
const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
const CeedInt ii = (i + data.t_id_x) % Q_1D;

for (CeedInt j = 0; j < Q_1D; j++) {
const CeedInt jj = ((j + p) % Q_1D);
const CeedInt jj = (j + data.t_id_y) % Q_1D;

atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
}
Expand Down

0 comments on commit a24d84e

Please sign in to comment.