diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c index 67f3d7ac7c..33b07a3087 100644 --- a/backends/cuda-shared/ceed-cuda-shared-basis.c +++ b/backends/cuda-shared/ceed-cuda-shared-basis.c @@ -64,7 +64,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, if (dim == 1) { CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); // avoid >512 total threads - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -77,7 +77,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; // elems_per_block must be at least 1 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -88,7 +88,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, } } else if (dim == 3) { CeedInt elems_per_block = 1; - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -115,7 +115,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, if (dim == 1) { CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); // avoid >512 total threads - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -128,7 +128,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; // elems_per_block must be at least 1 CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -139,7 +139,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, } } else if (dim == 3) { CeedInt elems_per_block = 1; - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -159,19 +159,19 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add, void *weight_args[] = {(void *)&num_elem, (void *)&data->d_q_weight_1d, &d_v}; if (dim == 1) { const CeedInt elems_per_block = block_size / Q_1d; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args)); } else if (dim == 2) { const CeedInt opt_elems = block_size / (Q_1d * Q_1d); const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); } else if (dim == 3) { const CeedInt opt_elems = block_size / (Q_1d * Q_1d); const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); } @@ -211,9 +211,9 @@ static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt nu static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_add, const CeedInt num_elem, const CeedInt *num_points, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; - CeedInt Q_1d, dim, max_num_points = num_points[0]; - const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; - const int max_block_size = 32; + Ceed_Cuda *ceed_Cuda; + CeedInt Q_1d, dim, num_comp, max_num_points = num_points[0]; + const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; const CeedScalar *d_x, *d_u; CeedScalar *d_v; CeedBasis_Cuda_shared *data; @@ -221,6 +221,7 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad CeedCallBackend(CeedBasisGetData(basis, &data)); CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); CeedCallBackend(CeedBasisGetDimension(basis, &dim)); + CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); // Weight handled separately if (eval_mode == CEED_EVAL_WEIGHT) { @@ -229,14 +230,13 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad } CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); + CeedCallBackend(CeedGetData(ceed, &ceed_Cuda)); // Check padded to uniform number of points per elem for (CeedInt i = 1; i < num_elem; i++) max_num_points = CeedIntMax(max_num_points, num_points[i]); { - CeedInt num_comp, q_comp; + CeedInt q_comp; CeedSize len, len_required; - - CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, eval_mode, &q_comp)); CeedCallBackend(CeedVectorGetLength(is_transpose ? 
u : v, &len)); len_required = (CeedSize)num_comp * (CeedSize)q_comp * (CeedSize)num_elem * (CeedSize)max_num_points; @@ -285,15 +285,14 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad } // -- Compile kernels - const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-ref-basis-tensor-at-points.h>\n"; + const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h>\n"; CeedInt num_comp; if (data->moduleAtPoints) CeedCallCuda(ceed, cuModuleUnload(data->moduleAtPoints)); CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", - Q_1d * CeedIntPow(Q_1d > P_1d ? Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, - "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", - max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); + CeedCallBackend(CeedCompile_Cuda(ceed, basis_kernel_source, &data->moduleAtPoints, 8, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", + CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), + "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); @@ -323,17 +322,76 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad // Basis action switch (eval_mode) { case CEED_EVAL_INTERP: { - void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + CeedInt P_1d, Q_1d; - CeedCallBackend( - CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args)); + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + + CeedCallBackend(CeedInit_CudaInterp(data->d_chebyshev_interp_1d, P_1d, Q_1d, &data->c_B)); + void *interp_args[] = {(void *)&num_elem, &data->c_B, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + + if (dim == 1) { + CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, + 1)); // avoid >512 total threads + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1, + elems_per_block, shared_mem, interp_args)); + } else if (dim == 2) { + const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; + // elems_per_block must be at least 1 + CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ?
data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, + thread_1d, elems_per_block, shared_mem, interp_args)); + } else if (dim == 3) { + CeedInt elems_per_block = 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, + thread_1d, elems_per_block, shared_mem, interp_args)); + } } break; case CEED_EVAL_GRAD: { - void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + CeedInt P_1d, Q_1d; + + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + + CeedCallBackend(CeedInit_CudaInterp(data->d_chebyshev_interp_1d, P_1d, Q_1d, &data->c_B)); + void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + + if (dim == 1) { + CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, + 1)); // avoid >512 total threads + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1, + elems_per_block, shared_mem, grad_args)); + } else if (dim == 2) { + const CeedInt opt_elems[7] = {0, 32, 8, 6, 4, 2, 8}; + // elems_per_block must be at least 1 + CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); + } else if (dim == 3) { + CeedInt elems_per_block = 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); + } } break; case CEED_EVAL_WEIGHT: case CEED_EVAL_NONE: /* handled separately below */ diff --git a/backends/hip-shared/ceed-hip-shared-basis.c b/backends/hip-shared/ceed-hip-shared-basis.c index b08d1fa271..3926623a21 100644 --- a/backends/hip-shared/ceed-hip-shared-basis.c +++ b/backends/hip-shared/ceed-hip-shared-basis.c @@ -123,7 +123,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, if (dim == 1) { CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; elems_per_block = elems_per_block > 0 ? elems_per_block : 1; - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -135,7 +135,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, } else if (dim == 2) { // Check if required threads is small enough to do multiple elems const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -146,7 +146,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, } } else if (dim == 3) { const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -173,7 +173,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, if (dim == 1) { CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; elems_per_block = elems_per_block > 0 ? elems_per_block : 1; - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -185,7 +185,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, } else if (dim == 2) { // Check if required threads is small enough to do multiple elems const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -196,7 +196,7 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, } } else if (dim == 3) { const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); - CeedInt grid = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); if (t_mode == CEED_TRANSPOSE) { @@ -218,19 +218,19 @@ static int CeedBasisApplyTensorCore_Hip_shared(CeedBasis basis, bool apply_add, if (dim == 1) { const CeedInt opt_elems = block_size / Q_1d; const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 
1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weight_args)); } else if (dim == 2) { const CeedInt opt_elems = block_size / (Q_1d * Q_1d); const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); } else if (dim == 3) { const CeedInt opt_elems = block_size / (Q_1d * Q_1d); const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; - const CeedInt grid_size = num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0); + const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)); } @@ -271,8 +271,7 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedVector x_ref, CeedVector u, CeedVector v) { Ceed ceed; CeedInt Q_1d, dim, max_num_points = num_points[0]; - const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; - const int max_block_size = 32; + const CeedInt is_transpose = t_mode == CEED_TRANSPOSE; const CeedScalar *d_x, *d_u; CeedScalar *d_v; CeedBasis_Hip_shared *data; @@ -344,15 +343,15 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add } // -- Compile kernels - const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-ref-basis-tensor-at-points.h>\n"; + const char basis_kernel_source[] = "// AtPoints basis source\n#include <ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h>\n"; CeedInt num_comp; if (data->moduleAtPoints) CeedCallHip(ceed, hipModuleUnload(data->moduleAtPoints)); CeedCallBackend(CeedBasisGetNumComponents(basis, &num_comp)); - CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "BASIS_BUF_LEN", - Q_1d * CeedIntPow(Q_1d > P_1d ?
Q_1d : P_1d, dim - 1), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, - "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", - max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1))); + CeedCallBackend(CeedCompile_Hip(ceed, basis_kernel_source, &data->moduleAtPoints, 9, "BASIS_Q_1D", Q_1d, "BASIS_P_1D", P_1d, "T_1D", + CeedIntMax(Q_1d, P_1d), "BASIS_DIM", dim, "BASIS_NUM_COMP", num_comp, "BASIS_NUM_NODES", CeedIntPow(P_1d, dim), + "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS", max_num_points, "BASIS_INTERP_BLOCK_SIZE", + data->block_sizes[0])); CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints)); CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints)); @@ -382,17 +381,72 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add // Basis action switch (eval_mode) { case CEED_EVAL_INTERP: { - void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + CeedInt P_1d, Q_1d; + CeedInt block_size = data->block_sizes[0]; + + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + + if (dim == 1) { + CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); - CeedCallBackend( - CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, 1, + elems_per_block, shared_mem, interp_args)); + } else if (dim == 2) { + // Check if required threads is small enough to do multiple elems + const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, + thread_1d, elems_per_block, shared_mem, interp_args)); + } else if (dim == 3) { + const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? 
data->InterpTransposeAtPoints : data->InterpAtPoints, grid, thread_1d, + thread_1d, elems_per_block, shared_mem, interp_args)); + } } break; case CEED_EVAL_GRAD: { - void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; - const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size); + CeedInt P_1d, Q_1d; + CeedInt block_size = data->block_sizes[0]; + + CeedCallBackend(CeedBasisGetNumNodes1D(basis, &P_1d)); + CeedCallBackend(CeedBasisGetNumQuadraturePoints1D(basis, &Q_1d)); + CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); + void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v}; + + if (dim == 1) { + CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; + elems_per_block = elems_per_block > 0 ? elems_per_block : 1; + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args)); + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, 1, + elems_per_block, shared_mem, grad_args)); + } else if (dim == 2) { + // Check if required threads is small enough to do multiple elems + const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); + } else if (dim == 3) { + const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); + CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); + CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); + + CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, grid, thread_1d, thread_1d, + elems_per_block, shared_mem, grad_args)); + } } break; case CEED_EVAL_WEIGHT: case CEED_EVAL_NONE: /* handled separately below */ diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h new file mode 100644 index 0000000000..35681df470 --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h @@ -0,0 +1,568 @@ +// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
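Across both backends the grid size is now computed as num_elem / elems_per_block + (num_elem % elems_per_block > 0), which is ordinary ceiling division written without the multiply-back of the old form. A minimal standalone check, assuming only that CeedInt is a plain integer type; the GridOld/GridNew helpers are illustrative, not part of either backend:

  #include <cassert>

  // Old and new grid-size expressions: both compute ceil(num_elem / elems_per_block)
  static int GridOld(int num_elem, int elems_per_block) {
    return num_elem / elems_per_block + ((num_elem / elems_per_block * elems_per_block < num_elem) ? 1 : 0);
  }

  static int GridNew(int num_elem, int elems_per_block) {
    return num_elem / elems_per_block + (num_elem % elems_per_block > 0);
  }

  int main() {
    // Exhaustive check over a small range; the two forms agree for every
    // non-negative element count and positive block size
    for (int e = 0; e <= 2048; e++) {
      for (int b = 1; b <= 64; b++) assert(GridOld(e, b) == GridNew(e, b));
    }
    return 0;
  }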
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA shared memory tensor product basis AtPoints templates +#include <ceed/types.h> + +//------------------------------------------------------------------------------ +// Chebyshev values +//------------------------------------------------------------------------------ +template <int Q_1D> +inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { + chebyshev_x[0] = 1.0; + chebyshev_x[1] = 2 * x; + for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; +} + +template <int Q_1D> +inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { + CeedScalar chebyshev_x[3]; + + chebyshev_x[1] = 1.0; + chebyshev_x[2] = 2 * x; + chebyshev_dx[0] = 0.0; + chebyshev_dx[1] = 2.0; + for (CeedInt i = 2; i < Q_1D; i++) { + chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; + chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; + } +} + +//------------------------------------------------------------------------------ +// 1D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 1D interpolate to points +//------------------------------------------------------------------------------ +template <int NUM_COMP, int NUM_POINTS, int Q_1D> +inline __device__ void InterpAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + CeedScalar chebyshev_x[Q_1D]; + + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Load coefficients + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; + __syncthreads(); + // Contract x direction + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * data.slice[i]; + } + } +} + +//------------------------------------------------------------------------------ +// 1D interpolate transpose +//------------------------------------------------------------------------------ +template <int NUM_COMP, int NUM_POINTS, int Q_1D> +inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Clear shared memory + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; + __syncthreads(); + // Contract x direction + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p]; + } +} + +//------------------------------------------------------------------------------ +// 1D derivatives at points +//------------------------------------------------------------------------------ +template <int NUM_COMP, int NUM_POINTS, int Q_1D> +inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < NUM_COMP; i++)
r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Load coefficients + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; + __syncthreads(); + // Contract x direction + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * data.slice[i]; + } + } +} + +//------------------------------------------------------------------------------ +// 1D derivatives transpose +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Clear shared memory + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; + __syncthreads(); + // Contract x direction + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p]; + } +} + +//------------------------------------------------------------------------------ +// 2D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 2D interpolate to points +//------------------------------------------------------------------------------ +template +inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Load coefficients + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; + __syncthreads(); + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = 0.0; + for (CeedInt j = 0; j < Q_1D; j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + } + } + // Contract y direction + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * buffer[i]; + } + } +} + +//------------------------------------------------------------------------------ +// 2D interpolate transpose +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; + __syncthreads(); + // Contract y direction + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_U[comp]; + } + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: shifting to avoid atomic adds + const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + + for 
(CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = (j + p) % Q_1D; + + atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } +} + +//------------------------------------------------------------------------------ +// 2D derivatives at points +//------------------------------------------------------------------------------ +template +inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Load coefficients + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp]; + __syncthreads(); + for (CeedInt dim = 0; dim < 2; dim++) { + // Contract x direction + if (dim == 0) ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = 0.0; + for (CeedInt j = 0; j < Q_1D; j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + } + } + // Contract y direction + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i]; + } + } + } +} + +//------------------------------------------------------------------------------ +// 2D derivatives transpose +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; + __syncthreads(); + for (CeedInt dim = 0; dim < 2; dim++) { + // Contract y direction + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP]; + } + // Contract x direction + if (dim == 0) ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: shifting to avoid atomic adds + const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + + for (CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = (j + p) % Q_1D; + + atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } +} + +//------------------------------------------------------------------------------ +// 3D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 3D interpolate to points 
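The ChebyshevPolynomialsAtPoint recurrence above (1, 2x, then 2x * c[i-1] - c[i-2]) generates Chebyshev polynomials of the second kind, U_n(x). A host-side sketch checking the recurrence against the closed form U_n(cos t) = sin((n + 1) t) / sin(t), with a stand-in value for the JiT-time Q_1D:

  #include <cassert>
  #include <cmath>

  int main() {
    const int Q_1D = 6;  // stand-in for the JiT-compiled BASIS_Q_1D
    const double x = 0.37, t = std::acos(x);
    double u[Q_1D];

    // Same recurrence as ChebyshevPolynomialsAtPoint
    u[0] = 1.0;
    u[1] = 2 * x;
    for (int i = 2; i < Q_1D; i++) u[i] = 2 * x * u[i - 1] - u[i - 2];
    // Closed form for |x| < 1: U_n(cos t) = sin((n + 1) t) / sin(t)
    for (int n = 0; n < Q_1D; n++) assert(std::fabs(u[n] - std::sin((n + 1) * t) / std::sin(t)) < 1e-9);
    return 0;
  }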
+//------------------------------------------------------------------------------ +template +inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Load coefficients + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; + __syncthreads(); + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = 0.0; + for (CeedInt j = 0; j < Q_1D; j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + } + } + // Contract y and z direction + ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * buffer[i] * z; + } + } + } +} + +//------------------------------------------------------------------------------ +// 3D interpolate transpose +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; + __syncthreads(); + // Contract y and z direction + ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_U[comp] * z; + } + // Contract x direction + ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: shifting to avoid atomic adds + const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + + for (CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = ((j + p) % Q_1D); + + atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } + } +} + +//------------------------------------------------------------------------------ +// 3D derivatives at points +//------------------------------------------------------------------------------ +template +inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *r_X, + CeedScalar *__restrict__ r_V) { + for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0; + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Load coefficients + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D]; + __syncthreads(); + for (CeedInt dim = 0; dim < 3; dim++) { + // Contract x direction + if (dim == 0) ChebyshevDerivativeAtPoint(r_X[0], 
chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = 0.0; + for (CeedInt j = 0; j < Q_1D; j++) { + buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D]; + } + } + // Contract y and z direction + if (dim == 2) ChebyshevDerivativeAtPoint(r_X[2], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z; + } + } + } + } +} + +//------------------------------------------------------------------------------ +// 3D derivatives transpose +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const CeedInt p, const CeedScalar *__restrict__ r_U, const CeedScalar *r_X, + CeedScalar *__restrict__ r_C) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + for (CeedInt k = 0; k < Q_1D; k++) { + CeedScalar buffer[Q_1D]; + CeedScalar chebyshev_x[Q_1D]; + + // Clear shared memory + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0; + __syncthreads(); + for (CeedInt dim = 0; dim < 3; dim++) { + // Contract y and z direction + if (dim == 2) ChebyshevDerivativeAtPoint(r_X[2], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[2], chebyshev_x); + const CeedScalar z = chebyshev_x[k]; + + if (dim == 1) ChebyshevDerivativeAtPoint(r_X[1], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[1], chebyshev_x); + for (CeedInt i = 0; i < Q_1D; i++) { + buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z; + } + // Contract x direction + if (dim == 0) ChebyshevDerivativeAtPoint(r_X[0], chebyshev_x); + else ChebyshevPolynomialsAtPoint(r_X[0], chebyshev_x); + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + // Note: shifting to avoid atomic adds + const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + + for (CeedInt j = 0; j < Q_1D; j++) { + const CeedInt jj = ((j + p) % Q_1D); + + atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); + } + } + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D]; + } + } +} + +//------------------------------------------------------------------------------ +// Loops over points +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Interpolate to points +//------------------------------------------------------------------------------ +template +inline __device__ void InterpAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C, + const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) { + const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % NUM_PTS; + CeedScalar r_X[DIM]; + + for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p]; + if (DIM == 1) { + InterpAtPoints1d(data, i, r_C, r_X, r_V); + } else if (DIM == 2) { + InterpAtPoints2d(data, i, r_C, r_X, 
r_V); + } else if (DIM == 3) { + InterpAtPoints3d(data, i, r_C, r_X, r_V); + } + if (i < NUM_PTS) { + for (CeedInt j = 0; j < NUM_COMP; j++) d_V[comp_stride * j + p] = r_V[j]; + } + } +} + +//------------------------------------------------------------------------------ +// Interpolate from points +//------------------------------------------------------------------------------ +template +inline __device__ void InterpTransposeAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedInt points_per_elem, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X, + CeedScalar *__restrict__ r_C) { + // Clear register + for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0; + + const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % NUM_PTS; + CeedScalar r_X[DIM]; + + for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p]; + for (CeedInt j = 0; j < NUM_COMP; j++) { + if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p]; + else r_U[j] = 0.0; + } + if (BASIS_DIM == 1) { + InterpTransposeAtPoints1d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 2) { + InterpTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + InterpTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + __syncthreads(); +} + +//------------------------------------------------------------------------------ +// Gradient at points +//------------------------------------------------------------------------------ +template +inline __device__ void GradAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C, + const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) { + const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % NUM_PTS; + CeedScalar r_X[DIM]; + + for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p]; + if (DIM == 1) { + GradAtPoints1d(data, i, r_C, r_X, r_V); + } else if (DIM == 2) { + GradAtPoints2d(data, i, r_C, r_X, r_V); + } else if (DIM == 3) { + GradAtPoints3d(data, i, r_C, r_X, r_V); + } + if (i < NUM_PTS) { + for (CeedInt j = 0; j < NUM_COMP * DIM; j++) d_V[comp_stride * j + p] = r_V[j]; + } + } +} + +//------------------------------------------------------------------------------ +// Grad from points +//------------------------------------------------------------------------------ +template +inline __device__ void GradTransposeAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedInt points_per_elem, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X, + CeedScalar *__restrict__ r_C) { + // Clear register + for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? 
Q_1D : 1); i++) r_C[i] = 0.0; + + const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % NUM_PTS; + CeedScalar r_X[DIM]; + + for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p]; + for (CeedInt j = 0; j < NUM_COMP * DIM; j++) { + if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p]; + else r_U[j] = 0.0; + } + if (BASIS_DIM == 1) { + GradTransposeAtPoints1d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 2) { + GradTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + GradTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + __syncthreads(); +} diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h new file mode 100644 index 0000000000..cfc6899476 --- /dev/null +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h @@ -0,0 +1,170 @@ +// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for CUDA tensor product basis with AtPoints evaluation +#include + +#include "cuda-shared-basis-read-write-templates.h" +#include "cuda-shared-basis-tensor-at-points-templates.h" +#include "cuda-shared-basis-tensor-templates.h" + +//------------------------------------------------------------------------------ +// Tensor Basis Kernels AtPoints +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// Interp +//------------------------------------------------------------------------------ +extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, + const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP]; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Map to coefficients + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + Interp1d(data, r_U, c_B, r_C); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor2d(data, r_U, c_B, r_C); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor3d(data, r_U, c_B, r_C); + } + + // Map to points + InterpAtPoints(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V, + &d_V[elem * BASIS_NUM_PTS]); + } +} + +extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Map from points + InterpTransposeAtPoints(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem], + &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C); + + // Map from coefficients + if (BASIS_DIM == 1) { + InterpTranspose1d(data, r_C, c_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + InterpTransposeTensor2d(data, r_C, c_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + InterpTransposeTensor3d(data, r_C, c_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} + +//------------------------------------------------------------------------------ +// Grad +//------------------------------------------------------------------------------ +extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, const CeedInt *__restrict__ points_per_elem, + const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? 
BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM]; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Map to coefficients + if (BASIS_DIM == 1) { + ReadElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U); + Interp1d(data, r_U, c_B, r_C); + } else if (BASIS_DIM == 2) { + ReadElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor2d(data, r_U, c_B, r_C); + } else if (BASIS_DIM == 3) { + ReadElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U); + InterpTensor3d(data, r_U, c_B, r_C); + } + + // Map to points + GradAtPoints(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V, + &d_V[elem * BASIS_NUM_PTS]); + } +} + +extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *__restrict__ c_B, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ d_X, + const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) { + extern __shared__ CeedScalar slice[]; + + SharedData_Cuda data; + data.t_id_x = threadIdx.x; + data.t_id_y = threadIdx.y; + data.t_id_z = threadIdx.z; + data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; + data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + + CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; + CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; + + // Apply basis element by element + for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Map from points + GradTransposeAtPoints(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem], + &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C); + + // Map from coefficients + if (BASIS_DIM == 1) { + InterpTranspose1d(data, r_C, c_B, r_V); + SumElementStrided1d(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 2) { + InterpTransposeTensor2d(data, r_C, c_B, r_V); + SumElementStrided2d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } else if (BASIS_DIM == 3) { + InterpTransposeTensor3d(data, r_C, c_B, r_V); + SumElementStrided3d(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem, + BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V); + } + } +} diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h new file mode 100644 index 0000000000..7844810c2d --- /dev/null +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h @@ -0,0 +1,568 @@ +// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 
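The kernels above all share one launch contract: thread_1d = max(P_1d, Q_1d) threads in x (and in y for dim > 1), one element per z-slot, dynamic shared memory holding one slice per z-slot, and a stride loop over gridDim.x * blockDim.z covering the elements. A reduced sketch of just that indexing, as a hypothetical kernel rather than the real basis code; the guard-inside-loop layout is chosen here so every __syncthreads() is reached block-uniformly even for the tail block:

  #define T_1D 8  // stand-in for the JiT-time T_1D = CeedIntMax(Q_1d, P_1d)

  extern "C" __global__ void SliceIndexingSketch(const int num_elem, double *d_out) {
    extern __shared__ double slice[];
    // One T_1D x T_1D slice per element slot in the block, as in data.slice
    double *my_slice = slice + threadIdx.z * T_1D * T_1D;

    for (int base = blockIdx.x * blockDim.z; base < num_elem; base += gridDim.x * blockDim.z) {
      const int elem = base + threadIdx.z;

      if (elem < num_elem) my_slice[threadIdx.x + threadIdx.y * T_1D] = (double)elem;  // stand-in for basis work
      __syncthreads();
      if (elem < num_elem && threadIdx.x == 0 && threadIdx.y == 0) d_out[elem] = my_slice[0];
      __syncthreads();
    }
  }

A launch matching the host code above would be <<<grid, dim3(T_1D, T_1D, elems_per_block), elems_per_block * T_1D * T_1D * sizeof(double)>>>, with grid from the ceiling division shown earlier.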
+// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed + +/// @file +/// Internal header for HIP shared memory tensor product basis AtPoints templates +#include <ceed/types.h> + +//------------------------------------------------------------------------------ +// Chebyshev values +//------------------------------------------------------------------------------ +template <int Q_1D> +inline __device__ void ChebyshevPolynomialsAtPoint(const CeedScalar x, CeedScalar *chebyshev_x) { + chebyshev_x[0] = 1.0; + chebyshev_x[1] = 2 * x; + for (CeedInt i = 2; i < Q_1D; i++) chebyshev_x[i] = 2 * x * chebyshev_x[i - 1] - chebyshev_x[i - 2]; +} + +template <int Q_1D> +inline __device__ void ChebyshevDerivativeAtPoint(const CeedScalar x, CeedScalar *chebyshev_dx) { + CeedScalar chebyshev_x[3]; + + chebyshev_x[1] = 1.0; + chebyshev_x[2] = 2 * x; + chebyshev_dx[0] = 0.0; + chebyshev_dx[1] = 2.0; + for (CeedInt i = 2; i < Q_1D; i++) { + chebyshev_x[(i + 1) % 3] = 2 * x * chebyshev_x[(i + 0) % 3] - chebyshev_x[(i + 2) % 3]; + chebyshev_dx[i] = 2 * x * chebyshev_dx[i - 1] + 2 * chebyshev_x[(i + 0) % 3] - chebyshev_dx[i - 2]; + } +} + +//------------------------------------------------------------------------------ +// 1D +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// 1D interpolate to points +//------------------------------------------------------------------------------ +template <int NUM_COMP, int NUM_POINTS, int Q_1D> +inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, + CeedScalar *__restrict__ r_V) { + CeedScalar chebyshev_x[Q_1D]; + + for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0; + ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Load coefficients + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp]; + __syncthreads(); + // Contract x direction + for (CeedInt i = 0; i < Q_1D; i++) { + r_V[comp] += chebyshev_x[i] * data.slice[i]; + } + } +} + +//------------------------------------------------------------------------------ +// 1D interpolate transpose +//------------------------------------------------------------------------------ +template <int NUM_COMP, int NUM_POINTS, int Q_1D> +inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U, + const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x); + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + // Clear shared memory + if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0; + __syncthreads(); + // Contract x direction + if (p < NUM_POINTS) { + for (CeedInt i = 0; i < Q_1D; i++) { + atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); + } + } + // Pull from shared to register + __syncthreads(); + if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p]; + } +} + +//------------------------------------------------------------------------------ +// 1D derivatives at points +//------------------------------------------------------------------------------ +template <int NUM_COMP, int NUM_POINTS, int Q_1D> +inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X, + CeedScalar *__restrict__ r_V) { + CeedScalar chebyshev_x[Q_1D]; + + ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x); + for
+
+//------------------------------------------------------------------------------
+// 1D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 1D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Load coefficients
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * data.slice[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Clear shared memory
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
+    __syncthreads();
+    // Contract x direction
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Load coefficients
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * data.slice[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 1D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  CeedScalar chebyshev_x[Q_1D];
+
+  ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    // Clear shared memory
+    if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = 0.0;
+    __syncthreads();
+    // Contract x direction
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]);
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D) r_C[comp] = data.slice[p];
+  }
+}
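
The "(i + p) % Q_1D" pattern in the transpose helpers above rotates each point's visiting order over the shared-memory slots, so threads that run in lockstep tend to hit different slots and the atomicAdd traffic is spread out rather than serialized on one address. A hypothetical, simplified 1D illustration (names are illustrative; requires hardware double atomicAdd):

    __device__ void scatter_shifted(int Q, int p, double value, const double *basis, double *slice) {
      for (int i = 0; i < Q; i++) {
        // Rotate the visiting order by the point index: one full pass over all Q slots,
        // but each point starts at a different slot
        const int ii = (i + p) % Q;
        atomicAdd(&slice[ii], basis[ii] * value);
      }
    }
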
+
+//------------------------------------------------------------------------------
+// 2D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 2D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Load coefficients
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    __syncthreads();
+    // Contract x direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      buffer[i] = 0.0;
+      for (CeedInt j = 0; j < Q_1D; j++) {
+        buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+      }
+    }
+    // Contract y direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      r_V[comp] += chebyshev_x[i] * buffer[i];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Clear shared memory
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    __syncthreads();
+    // Contract y direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+    for (CeedInt i = 0; i < Q_1D; i++) {
+      buffer[i] = chebyshev_x[i] * r_U[comp];
+    }
+    // Contract x direction
+    ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+    if (p < NUM_POINTS) {
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        // Note: shifting to avoid atomic adds
+        const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          const CeedInt jj = (j + p) % Q_1D;
+
+          atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+        }
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP * 2; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Load coefficients
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
+    __syncthreads();
+    for (CeedInt dim = 0; dim < 2; dim++) {
+      // Contract x direction
+      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = 0.0;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+        }
+      }
+      // Contract y direction
+      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i];
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 2D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    CeedScalar buffer[Q_1D];
+    CeedScalar chebyshev_x[Q_1D];
+
+    // Clear shared memory
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+    __syncthreads();
+    for (CeedInt dim = 0; dim < 2; dim++) {
+      // Contract y direction
+      if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP];
+      }
+      // Contract x direction
+      if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      if (p < NUM_POINTS) {
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          // Note: shifting to avoid atomic adds
+          const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            const CeedInt jj = (j + p) % Q_1D;
+
+            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+          }
+        }
+      }
+    }
+    // Pull from shared to register
+    __syncthreads();
+    if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[comp] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+  }
+}
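
Per point, the 2D helpers above perform a standard tensor contraction: with coefficients laid out x-fastest, the value at (x, y) is sum_j U_j(y) * sum_i U_i(x) * c[i + j * Q]. A serial reference (hypothetical, names illustrative) of what one thread computes for its point:

    double interp_at_point_2d(int Q, const double *c, const double *ux, const double *uy) {
      double v = 0.0;
      for (int j = 0; j < Q; j++) {
        // Contract x for row j, then fold the row into the y contraction
        double row = 0.0;
        for (int i = 0; i < Q; i++) row += ux[i] * c[i + j * Q];
        v += uy[j] * row;
      }
      return v;
    }
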
+
+//------------------------------------------------------------------------------
+// 3D
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// 3D interpolate to points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                        CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Load coefficients
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
+      __syncthreads();
+      // Contract x direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = 0.0;
+        for (CeedInt j = 0; j < Q_1D; j++) {
+          buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+        }
+      }
+      // Contract y and z direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+      const CeedScalar z = chebyshev_x[k];
+
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        r_V[comp] += chebyshev_x[i] * buffer[i] * z;
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D interpolate transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                                 const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Clear shared memory
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+      __syncthreads();
+      // Contract y and z direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+      const CeedScalar z = chebyshev_x[k];
+
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+      for (CeedInt i = 0; i < Q_1D; i++) {
+        buffer[i] = chebyshev_x[i] * r_U[comp] * z;
+      }
+      // Contract x direction
+      ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+      if (p < NUM_POINTS) {
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          // Note: shifting to avoid atomic adds
+          const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            const CeedInt jj = ((j + p) % Q_1D);
+
+            atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+          }
+        }
+      }
+      // Pull from shared to register
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+    }
+  }
+}
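
The 3D helpers above work slice by slice: only one Q x Q slab fits in shared memory per z-plane of the block, so the k loop stages each slice, contracts it in x and y, and scales the result by the z basis value U_k(z). A serial reference (hypothetical, names illustrative):

    double interp_at_point_3d(int Q, const double *c, const double *ux, const double *uy, const double *uz) {
      double v = 0.0;
      for (int k = 0; k < Q; k++) {
        // Contract the k-th Q x Q slice in x and y, then weight by the z factor
        double slab = 0.0;
        for (int j = 0; j < Q; j++) {
          double row = 0.0;
          for (int i = 0; i < Q; i++) row += ux[i] * c[i + Q * (j + Q * k)];
          slab += uy[j] * row;
        }
        v += uz[k] * slab;
      }
      return v;
    }
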
+
+//------------------------------------------------------------------------------
+// 3D derivatives at points
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_C, const CeedScalar *__restrict__ r_X,
+                                      CeedScalar *__restrict__ r_V) {
+  for (CeedInt i = 0; i < NUM_COMP * 3; i++) r_V[i] = 0.0;
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Load coefficients
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
+      __syncthreads();
+      for (CeedInt dim = 0; dim < 3; dim++) {
+        // Contract x direction
+        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          buffer[i] = 0.0;
+          for (CeedInt j = 0; j < Q_1D; j++) {
+            buffer[i] += chebyshev_x[j] * data.slice[j + i * Q_1D];
+          }
+        }
+        // Contract y and z direction
+        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        const CeedScalar z = chebyshev_x[k];
+
+        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          r_V[comp + dim * NUM_COMP] += chebyshev_x[i] * buffer[i] * z;
+        }
+      }
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// 3D derivatives transpose
+//------------------------------------------------------------------------------
+template <int NUM_COMP, int NUM_POINTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedInt p, const CeedScalar *__restrict__ r_U,
+                                               const CeedScalar *__restrict__ r_X, CeedScalar *__restrict__ r_C) {
+  for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
+    for (CeedInt k = 0; k < Q_1D; k++) {
+      CeedScalar buffer[Q_1D];
+      CeedScalar chebyshev_x[Q_1D];
+
+      // Clear shared memory
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = 0.0;
+      __syncthreads();
+      for (CeedInt dim = 0; dim < 3; dim++) {
+        // Contract y and z direction
+        if (dim == 2) ChebyshevDerivativeAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[2], chebyshev_x);
+        const CeedScalar z = chebyshev_x[k];
+
+        if (dim == 1) ChebyshevDerivativeAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[1], chebyshev_x);
+        for (CeedInt i = 0; i < Q_1D; i++) {
+          buffer[i] = chebyshev_x[i] * r_U[comp + dim * NUM_COMP] * z;
+        }
+        // Contract x direction
+        if (dim == 0) ChebyshevDerivativeAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        else ChebyshevPolynomialsAtPoint<Q_1D>(r_X[0], chebyshev_x);
+        if (p < NUM_POINTS) {
+          for (CeedInt i = 0; i < Q_1D; i++) {
+            // Note: shifting to avoid atomic adds
+            const CeedInt ii = (i + (p / Q_1D)) % Q_1D;
+
+            for (CeedInt j = 0; j < Q_1D; j++) {
+              const CeedInt jj = ((j + p) % Q_1D);
+
+              atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]);
+            }
+          }
+        }
+      }
+      // Pull from shared to register
+      __syncthreads();
+      if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) r_C[k + comp * Q_1D] += data.slice[data.t_id_x + data.t_id_y * Q_1D];
+    }
+  }
+}
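
The dim loop in the gradient helpers is the usual tensor-product gradient trick: component d reuses the interpolation contraction but substitutes the derivative table for the basis table in direction d only. A hypothetical serial sketch in 2D (names illustrative):

    void grad_at_point_2d(int Q, const double *c, const double *ux, const double *dux, const double *uy, const double *duy, double *grad) {
      for (int dim = 0; dim < 2; dim++) {
        // Derivative factor in exactly one direction per pass
        const double *bx = (dim == 0) ? dux : ux;
        const double *by = (dim == 1) ? duy : uy;
        double v = 0.0;
        for (int j = 0; j < Q; j++) {
          double row = 0.0;
          for (int i = 0; i < Q; i++) row += bx[i] * c[i + j * Q];
          v += by[j] * row;
        }
        grad[dim] = v;
      }
    }
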
+
+//------------------------------------------------------------------------------
+// Loops over points
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Interpolate to points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void InterpAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
+                                      const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar r_X[DIM];
+
+    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    if (DIM == 1) {
+      InterpAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 2) {
+      InterpAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 3) {
+      InterpAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    }
+    if (i < NUM_PTS) {
+      for (CeedInt j = 0; j < NUM_COMP; j++) d_V[comp_stride * j + p] = r_V[j];
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Interpolate from points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void InterpTransposeAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedInt points_per_elem,
+                                               const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
+                                               CeedScalar *__restrict__ r_C) {
+  // Clear register
+  for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
+
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar r_X[DIM];
+
+    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    for (CeedInt j = 0; j < NUM_COMP; j++) {
+      if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
+      else r_U[j] = 0.0;
+    }
+    if (DIM == 1) {
+      InterpTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (DIM == 2) {
+      InterpTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (DIM == 3) {
+      InterpTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    }
+  }
+  __syncthreads();
+}
+
+//------------------------------------------------------------------------------
+// Gradient at points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void GradAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
+                                    const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar r_X[DIM];
+
+    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    if (DIM == 1) {
+      GradAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 2) {
+      GradAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    } else if (DIM == 3) {
+      GradAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
+    }
+    if (i < NUM_PTS) {
+      for (CeedInt j = 0; j < NUM_COMP * DIM; j++) d_V[comp_stride * j + p] = r_V[j];
+    }
+  }
+}
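
The point loops above iterate to a padded bound rather than to NUM_PTS: every thread executes the same number of iterations, so the __syncthreads() calls inside the per-point helpers are reached by the whole block, while the actual work is masked by comparisons against the true point count. A hypothetical illustration (names mine; an integer ceiling is shown in place of the floating-point ceil used in the patch):

    __device__ void visit_points_padded(int num_pts, double *out) {
      const int stride = blockDim.x * blockDim.y;
      const int bound  = stride * ((num_pts + stride - 1) / stride);  // integer ceil
      for (int i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += stride) {
        // Collective work (including barriers) happens here for every i...
        if (i < num_pts) out[i] = 1.0;  // ...but only real points produce results
      }
    }
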
+
+//------------------------------------------------------------------------------
+// Grad from points
+//------------------------------------------------------------------------------
+template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
+inline __device__ void GradTransposeAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedInt points_per_elem,
+                                             const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
+                                             CeedScalar *__restrict__ r_C) {
+  // Clear register
+  for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
+
+  const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
+
+  for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
+    const CeedInt p = i % NUM_PTS;
+    CeedScalar r_X[DIM];
+
+    for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
+    for (CeedInt j = 0; j < NUM_COMP * DIM; j++) {
+      if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
+      else r_U[j] = 0.0;
+    }
+    if (DIM == 1) {
+      GradTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (DIM == 2) {
+      GradTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    } else if (DIM == 3) {
+      GradTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
+    }
+  }
+  __syncthreads();
+}
diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
new file mode 100644
index 0000000000..355f53d0f4
--- /dev/null
+++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h
@@ -0,0 +1,192 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED: http://github.com/ceed
+
+/// @file
+/// Internal header for HIP tensor product basis with AtPoints evaluation
+#include <ceed/types.h>
+
+#include "hip-shared-basis-read-write-templates.h"
+#include "hip-shared-basis-tensor-at-points-templates.h"
+#include "hip-shared-basis-tensor-templates.h"
+
+//------------------------------------------------------------------------------
+// Tensor Basis Kernels AtPoints
+//------------------------------------------------------------------------------
+
+//------------------------------------------------------------------------------
+// Interp
+//------------------------------------------------------------------------------
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
+                        const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
+  __syncthreads();
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map to coefficients
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    }
+
+    // Map to points
+    InterpAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
+                                                                         &d_V[elem * BASIS_NUM_PTS]);
+  }
+}
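
The kernels stage the Chebyshev coefficient matrix in shared memory once per block before the element loop. A sketch (hypothetical, assuming loadMatrix is a block-strided copy as its call here suggests) of that cooperative load pattern; a barrier must follow before the matrix is read:

    template <int SIZE>
    __device__ void load_matrix_sketch(const double *__restrict__ src, double *dst) {
      // Each thread copies every (block size)-th entry, so one pass over the
      // linearized P x Q matrix moves it all into shared memory
      for (int i = threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); i < SIZE;
           i += blockDim.x * blockDim.y * blockDim.z)
        dst[i] = src[i];
    }
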
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void InterpTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
+                                 const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
+  __syncthreads();
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map from points
+    InterpTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
+                                                                                  &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+// Grad
+//------------------------------------------------------------------------------
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void GradAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
+                      const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
+  __syncthreads();
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map to coefficients
+    if (BASIS_DIM == 1) {
+      ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, d_U, r_U);
+      Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 2) {
+      ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    } else if (BASIS_DIM == 3) {
+      ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                       BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, d_U, r_U);
+      InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_U, s_B, r_C);
+    }
+
+    // Map to points
+    GradAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
+                                                                       &d_V[elem * BASIS_NUM_PTS]);
+  }
+}
+
+extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__
+    void GradTransposeAtPoints(const CeedInt num_elem, const CeedScalar *d_chebyshev_interp_1d, const CeedInt *points_per_elem,
+                               const CeedScalar *__restrict__ d_X, const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ d_V) {
+  extern __shared__ CeedScalar slice[];
+
+  // load chebyshev_interp_1d into shared memory
+  __shared__ CeedScalar s_B[BASIS_P_1D * BASIS_Q_1D];
+  loadMatrix<BASIS_P_1D * BASIS_Q_1D>(d_chebyshev_interp_1d, s_B);
+  __syncthreads();
+
+  SharedData_Hip data;
+  data.t_id_x = threadIdx.x;
+  data.t_id_y = threadIdx.y;
+  data.t_id_z = threadIdx.z;
+  data.t_id   = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
+  data.slice  = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);
+
+  CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
+  CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+  CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
+
+  // Apply basis element by element
+  for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
+    // Map from points
+    GradTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
+                                                                                &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
+
+    // Map from coefficients
+    if (BASIS_DIM == 1) {
+      InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 2) {
+      InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem, BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    } else if (BASIS_DIM == 3) {
+      InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D>(data, r_C, s_B, r_V);
+      SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1D * num_elem,
+                                                      BASIS_P_1D * BASIS_P_1D * BASIS_P_1D, r_V, d_V);
+    }
+  }
+}
diff --git a/interface/ceed-preconditioning.c b/interface/ceed-preconditioning.c
index 1cc767cb08..1879464450 100644
--- a/interface/ceed-preconditioning.c
+++ b/interface/ceed-preconditioning.c
@@ -452,8 +452,8 @@ static int CeedSingleOperatorAssembleSymbolic(CeedOperator op, CeedInt offset, C
   CeedElemRestriction elem_rstr_in, elem_rstr_out, index_elem_rstr_in, index_elem_rstr_out;
 
   CeedCall(CeedOperatorIsComposite(op, &is_composite));
-  CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
   CeedCall(CeedOperatorGetCeed(op, &ceed));
+  CeedCheck(!is_composite, ceed, CEED_ERROR_UNSUPPORTED, "Composite operator not supported");
 
   CeedCall(CeedOperatorGetActiveVectorLengths(op, &num_nodes_in, &num_nodes_out));
   CeedCall(CeedOperatorGetActiveElemRestrictions(op, &elem_rstr_in, &elem_rstr_out));
diff --git a/tests/t354-basis.c b/tests/t354-basis.c
index e137f8c44f..4d3402b257 100644
--- a/tests/t354-basis.c
+++ b/tests/t354-basis.c
@@ -1,6 +1,6 @@
 /// @file
-/// Test polynomial interpolation to arbitrary points in multiple dimensions
-/// \test Test polynomial interpolation to arbitrary points in multiple dimensions
+/// Test polynomial interpolation transpose to arbitrary points in multiple dimensions
+/// \test Test polynomial interpolation transpose to arbitrary points in multiple dimensions
 #include <ceed.h>
 #include <math.h>
 #include <stdio.h>
@@ -85,7 +85,7 @@ int main(int argc, char **argv) {
       CeedBasisApplyAtPoints(basis_u, 1, num_point, CEED_TRANSPOSE, CEED_EVAL_INTERP, x_point, v_point, u_point);
       CeedVectorGetArrayRead(u_point, CEED_MEM_HOST, &u_point_array);
       for (CeedInt j = 0; j < p_dim; j++) fx += u_array[j] * u_point_array[j];
-      if (fabs(v_array[i] - fx) > 100. * CEED_EPSILON) {
+      if (fabs(v_array[i] - fx) > 500. * CEED_EPSILON) {
         // LCOV_EXCL_START
         printf("[%" CeedInt_FMT "] %f != %f = f(%f", dim, v_array[i], fx, coord[0]);
         for (CeedInt d = 1; d < dim; d++) printf(", %f", coord[d]);
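
The loosened tolerance in t354 reflects the identity the test exercises: for any matrix B and vectors u, v, (B u) . v equals u . (B^T v) in exact arithmetic, so a transpose apply can be verified through dot products; in floating point the two evaluation orders differ by accumulated rounding, hence a comparison against a modest multiple of CEED_EPSILON. A hypothetical standalone check of the identity (data values are arbitrary):

    #include <stdio.h>

    int main(void) {
      const double B[2][3] = {{1, 2, 3}, {4, 5, 6}};
      const double u[3] = {0.5, -1.0, 2.0}, v[2] = {1.5, 0.25};
      double       bu_dot_v = 0.0, u_dot_btv = 0.0;

      for (int i = 0; i < 2; i++) {
        for (int j = 0; j < 3; j++) {
          bu_dot_v += B[i][j] * u[j] * v[i];    // v^T (B u)
          u_dot_btv += u[j] * (B[i][j] * v[i]); // u^T (B^T v)
        }
      }
      printf("%f %f\n", bu_dot_v, u_dot_btv);  // identical up to rounding
      return 0;
    }
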