From 32881af0778ceef7bed14774cbb65405617428c7 Mon Sep 17 00:00:00 2001 From: almaudoh Date: Wed, 10 Jul 2024 03:38:10 +0200 Subject: [PATCH 1/7] Add relative positional encoding, RPE in cuda backend. --- libs/lczero-common | 2 +- src/neural/cuda/common_kernels.cu | 160 +++++++++++++-- src/neural/cuda/kernels.h | 7 + src/neural/cuda/layers.cc | 327 +++++++++++++++++++----------- src/neural/cuda/layers.h | 31 ++- src/neural/cuda/network_cuda.cc | 8 +- src/neural/network_legacy.cc | 5 +- src/neural/network_legacy.h | 3 + 8 files changed, 395 insertions(+), 148 deletions(-) diff --git a/libs/lczero-common b/libs/lczero-common index 55e1b382ef..e05fb7a505 160000 --- a/libs/lczero-common +++ b/libs/lczero-common @@ -1 +1 @@ -Subproject commit 55e1b382efadd57903e37f2a2e29caef3ea85799 +Subproject commit e05fb7a505554682acc8a197eb797c26b6db161d diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index 395bab8d84..9e058b309e 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -340,19 +340,21 @@ void addBias_NCHW(T* c, T* a, T* b, int N, int C, int H, int W, ReportCUDAErrors(cudaGetLastError()); } -template -__device__ dT readNCHW(const sT* input_tensor, int n, int c, int h, int w, - int Nin, int Cin, int H, int W) { - if (n >= Nin || c >= Cin) return 0; +template +__device__ dT readInputTensor(const sT* input_tensor, int i, int j, int k, + int l, int I, int J, int K, int L) { + // i is the outermost|slowest|most-significant index, while l is the + // innermost|fastest|least-significant index. + if (i >= I || j >= J || k >= K || l >= L) return 0; int index; - index = n; - index *= Cin; - index += c; - index *= H; - index += h; - index *= W; - index += w; + index = i; + index *= J; + index += j; + index *= K; + index += k; + index *= L; + index += l; return (dT)(input_tensor[index]); } @@ -376,7 +378,7 @@ __global__ void NCHWtoNHWC_kernel(dT* output_tensor, const sT* input_tensor, int n = index; output_tensor[tid] = - readNCHW(input_tensor, n, c, h, w, Nin, Cin, H, W); + readInputTensor(input_tensor, n, c, h, w, Nin, Cin, H, W); } template @@ -1242,7 +1244,8 @@ __global__ void preprocess_for_attention_body_kernel( if (c >= input_size) { // concatenate from position encoding array if (is_pe_dense_embedding) { - op = (T)(encoding[n * 64 * encoding_size + hw * encoding_size + (c - input_size)]); + op = (T)(encoding[n * 64 * encoding_size + hw * encoding_size + + (c - input_size)]); } else { op = (T)(encoding[64 * hw + (c - input_size)]); } @@ -1309,6 +1312,126 @@ void applyInputGating(T* output, const T* input, const T* mult, const T* add, ReportCUDAErrors(cudaGetLastError()); } +// Get the index corresponding to position (i,j,k,l) in a tensor +// with dimensions (I,J,K,L) where the innermost dimension is L. 
+__device__ __forceinline__ int getTensorIndex(int i, int j, int k, int l, int I, + int J, int K, int L) { + if (i >= I || j >= J || k >= K || l >= L) return -1; + + int index; + index = i; + index *= J; + index += j; + index *= K; + index += k; + index *= L; + index += l; + + return index; +} + +template +__global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, + const T* skipAdd, T* output, int B, + int H, int Q, int K, int D, + float outScale, size_t rpetype) { + int x = threadIdx.x + blockDim.x * blockIdx.x; + int q = threadIdx.y + blockDim.y * blockIdx.y; + int bh = threadIdx.z + blockDim.z * blockIdx.z; + int h = bh % H; + int b = bh / H; + + if (rpetype == 0) { + // RPE-Q + // rpeInput: [B, Q, H, D] -> transpose to [B, H, Q, (1, D)] + // rpeWeights: [D, H, Q, K] -> transpose to [1, H, Q, (D, K)] + // output: [B, H, Q, K] + + // Read tensors per the input layouts and write out per the output layout. + // Sum is along the D dimension, K is along the x-axis. + int k = x; + if (b >= B || h >= H || q >= Q || k >= K) return; + + T sum = 0; + for (int i = 0; i < D; i++) { + sum += readInputTensor(rpeInput, b, q, h, i, B, Q, H, D) * + readInputTensor(rpeWeights, i, h, q, k, D, H, Q, K); + } + int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); + output[outIdx] = (sum + skipAdd[outIdx]) * (T)outScale; + } else if (rpetype == 1) { + // RPE-K + // rpeInput: [B, K, H, D] -> transpose to [B, H, K, (1, D)] + // rpeWeights: [D, H, Q, K] -> transpose to [1, H, K, (D, Q)] + // output: [B, H, Q, K] + + // Read tensors per the input layouts and write out per the output layout. + // Sum is along the D dimension, and K is along the x-axis. + int k = x; + if (b >= B || h >= H || q >= Q || k >= K) return; + + T sum = 0; + for (int i = 0; i < D; i++) { + sum += readInputTensor(rpeInput, b, k, h, i, B, K, H, D) * + readInputTensor(rpeWeights, i, h, q, k, D, H, Q, K); + } + int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); + output[outIdx] = (sum + skipAdd[outIdx]) * (T)outScale; + } else if (rpetype == 2) { + // RPE-V + // rpeInput: [B, H, Q, K] -> transpose to [B, H, Q, (1, K)] + // rpeWeights: [D, H, Q, K] -> transpose to [1, H, Q, (K, D)] + // output: [B, Q, H, D] + // The skip connection is also already in BQHD order. + + // Read tensors per the input layouts and write out per the output layout. + // Sum is along the K dimension, and D is along the x-axis. + int d = x; + if (b >= B || h >= H || q >= Q || d >= D) return; + + T sum = 0; + for (int i = 0; i < K; i++) { + sum += readInputTensor(rpeInput, b, h, q, i, B, H, Q, K) * + readInputTensor(rpeWeights, d, h, q, i, D, H, Q, K); + } + int outIdx = getTensorIndex(b, q, h, d, B, Q, H, D); + output[outIdx] = (sum + skipAdd[outIdx]) * (T)outScale; + } +} + +template +void multiplyRPEAttentionLogits(const T* rpeInput, const T* rpeWeights, + const T* attnInput, T* output, int B, int H, + int Q, int K, int D, float outScale, + size_t rpetype, cudaStream_t stream) { + if (rpetype > 2) { + throw Exception("unsupported rpetype in multiplyRPEAttentionLogits."); + } + + // rpeType: Q | K | V + // rpeInput: [B, Q, H, D] | [B, K, H, D] | [B, H, Q, K] + // rpeWeights: [D, H, Q, K] | [D, H, Q, K] | [D, H, Q, K] + + // 3D block structure where x-axis maps to K (or D for rpetype==2), y-axis to + // Q and z-axis to BH. Each thread calculates the vector sum-product for the + // cube at BHQK | BHQD (i.e. "d,dk->k" | "d,dq->q" | "k,kd->d"). + dim3 blockDim, gridDim; + int X = rpetype == 2 ? 
D : K;
+  blockDim.x = std::min(32, X);
+  blockDim.y = std::min(32, Q);
+  blockDim.z = std::min(std::max(1024 / (blockDim.x * blockDim.y), 1u),
+                        (unsigned int)(B * H));
+  gridDim.x = DivUp(X, blockDim.x);
+  gridDim.y = DivUp(Q, blockDim.y);
+  gridDim.z = DivUp(B * H, blockDim.z);
+
+  rpeVectorMultiply_kernel<<<gridDim, blockDim, 0, stream>>>(
+      rpeInput, rpeWeights, attnInput, output, B, H, Q, K, D, outScale,
+      rpetype);
+
+  ReportCUDAErrors(cudaGetLastError());
+}
+
 // Template instantiation.
 template void copyTypeConverted(half* op, float* ip, int N,
                                 cudaStream_t stream);
@@ -1595,5 +1718,16 @@ template void applyInputGating(float* output, const float* input,
                                       const float* mult, const float* add,
                                       int N, int C, int output_size,
                                       cudaStream_t stream);
+
+template void multiplyRPEAttentionLogits(
+    const half* rpeInput, const half* rpeWeights, const half* attnInput,
+    half* output, int B, int H, int Q, int K, int D, float outScale,
+    size_t rpetype, cudaStream_t stream);
+
+template void multiplyRPEAttentionLogits(
+    const float* rpeInput, const float* rpeWeights, const float* attnInput,
+    float* output, int B, int H, int Q, int K, int D, float outScale,
+    size_t rpetype, cudaStream_t stream);
+
 }  // namespace cudnn_backend
 }  // namespace lczero
diff --git a/src/neural/cuda/kernels.h b/src/neural/cuda/kernels.h
index 483af0f4db..31db577808 100644
--- a/src/neural/cuda/kernels.h
+++ b/src/neural/cuda/kernels.h
@@ -155,5 +155,12 @@ void inputPreprocessForAttentionBody(T* output, const T* input,
 template <typename T>
 void applyInputGating(T* output, const T* input, const T* mult, const T* add,
                       int N, int HW, int C, cudaStream_t stream);
+
+template <typename T>
+void multiplyRPEAttentionLogits(const T* rpeInput, const T* rpeWeights,
+                                const T* attnInput, T* output, int B, int H,
+                                int Q, int K, int D, float outScale,
+                                size_t rpetype, cudaStream_t stream);
+
 }  // namespace cudnn_backend
 }  // namespace lczero
diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc
index 8543443897..7d0667859b 100644
--- a/src/neural/cuda/layers.cc
+++ b/src/neural/cuda/layers.cc
@@ -42,56 +42,55 @@ namespace lczero {
 #if 0
 // debug code to dump allocation in GPU memory
 template <typename T>
-void dumpTensor(T* memory, int elements, const char* message, bool only_summary = false) {
-  const bool fp16 = std::is_same<half, T>::value;
-  printf("\n%s\n", message);
-  int elementSize = (int) (fp16 ? sizeof(half) : sizeof(float));
-  int bytes = elements * elementSize;
-  void *temp = malloc(bytes);
-  cudaMemcpy(temp, memory, bytes, cudaMemcpyDeviceToHost);
-  float maxval = -std::numeric_limits<float>::max();
-  float minval = std::numeric_limits<float>::max();
-  int nans = 0;
-  int nanss[10] {};
-
-  for (int i = 0; i < elements; i++)
-  {
-    float val;
-    if (fp16)
-    {
-      half *arr = (half*)temp;
-      val = (float)arr[i];
-    }
-    else
-    {
-      float *arr = (float *)temp;
-      val = arr[i];
-    }
-    maxval = std::max(maxval, val);
-    minval = std::min(minval, val);
+void dumpTensor(T* memory, int elements, const char* message, int lines = -1,
+                int start = 0) {
+  const bool fp16 = std::is_same<half, T>::value;
+  printf("\n%s\n", message);
+  int elementSize = (int)(fp16 ? sizeof(half) : sizeof(float));
+  int bytes = elements * elementSize;
+  void* temp = malloc(bytes);
+  cudaMemcpy(temp, memory, bytes, cudaMemcpyDeviceToHost);
+  float maxval = -std::numeric_limits<float>::max();
+  float minval = std::numeric_limits<float>::max();
+  int nans = 0;
+  int nanss[10]{};
+
+  for (int i = 0; i < elements; i++) {
+    float val;
+    if (fp16) {
+      half* arr = (half*)temp;
+      val = (float)arr[i];
+    } else {
+      float* arr = (float*)temp;
+      val = arr[i];
+    }
+    maxval = std::max(maxval, val);
+    minval = std::min(minval, val);
 
-    if (std::isnan(val)) {
-      if (nans < 10) nanss[nans] = i;
-      nans++;
-    }
+    if (std::isnan(val)) {
+      if (nans < 10) nanss[nans] = i;
+      nans++;
+    }
 
-    if (!only_summary || i < 2 || i == elements - 1) {
-      // printf("%8.4f ", val);
-      // if ((i % 8) == 7) printf("\n");
-      printf("%i;%.6f\n", i, val);
-    }
+    if ((i >= start && (i < start + lines || lines == -1)) ||
+        i == elements - 1) {
+      // printf("%8.4f ", val);
+      // if ((i % 8) == 7) printf("\n");
+      printf("%i;%.6f\n", i, val);
     }
-    free(temp);
-    if (maxval == -std::numeric_limits<float>::max())
-      maxval = std::numeric_limits<float>::quiet_NaN();
-    if (minval == std::numeric_limits<float>::max())
-      minval = std::numeric_limits<float>::quiet_NaN();
-
-    printf("Max: %.6f, Min: %.6f, NaNs: %i of %i", maxval, minval, nans, elements);
-    printf("\nNaN indices: ");
-    for (int i=0; i<nans && i<10; i++) printf("%i ", nanss[i]);
-    if (nans > 10) printf("......");
-    printf("\n");
+  }
+  free(temp);
+  if (maxval == -std::numeric_limits<float>::max())
+    maxval = std::numeric_limits<float>::quiet_NaN();
+  if (minval == std::numeric_limits<float>::max())
+    minval = std::numeric_limits<float>::quiet_NaN();
+
+  printf("Max: %.6f, Min: %.6f, NaNs: %i of %i", maxval, minval, nans,
+         elements);
+  printf("\nNaN indices: ");
+  for (int i = 0; i < nans && i < 10; i++) printf("%i ", nanss[i]);
+  if (nans > 10) printf("......");
+  printf("\n");
 }
 #endif
@@ -1419,6 +1418,68 @@ void allocAndUpload(DataType** gpu_dest, std::vector cpu_src,
                     (int)cpu_src.size(), 0);
 }
 
+template <typename DataType>
+static void cublasXgemm(cublasHandle_t handle, cublasOperation_t transa,
+                        cublasOperation_t transb, int m, int n, int k,
+                        float alpha, const DataType* A, int lda,
+                        const DataType* B, int ldb, float beta, DataType* C,
+                        int ldc) {
+  const bool fp16 = std::is_same<half, DataType>::value;
+  if (fp16) {
+    unsigned short alpha_h = FP32toFP16(alpha);
+    unsigned short beta_h = FP32toFP16(beta);
+    ReportCUBLASErrors(cublasHgemm(
+        handle, transa, transb, m, n, k, (const half*)&alpha_h, (const half*)A,
+        lda, (const half*)B, ldb, (const half*)&beta_h, (half*)C, ldc));
+  } else {
+    ReportCUBLASErrors(cublasSgemm(handle, transa, transb, m, n, k, &alpha,
+                                   (const float*)A, lda, (const float*)B, ldb,
+                                   &beta, (float*)C, ldc));
+  }
+}
+
+template <typename DataType>
+static void cublasXGemmStridedBatched(
+    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+    int m, int n, int k, float alpha, const void* A, int lda,
+    long long int strideA, const void* B, int ldb, long long int strideB,
+    float beta, void* C, int ldc, long long int strideC, int batchCount) {
+  const bool fp16 = std::is_same<half, DataType>::value;
+  if (fp16) {
+    unsigned short alpha_h = FP32toFP16(alpha);
+    unsigned short beta_h = FP32toFP16(beta);
+    ReportCUBLASErrors(cublasGemmStridedBatchedEx(
+        handle, transa, transb, m, n, k, &alpha_h, A, CUDA_R_16F, lda, strideA,
+        B, CUDA_R_16F, ldb, strideB, &beta_h, C, CUDA_R_16F, ldc, strideC,
+        batchCount, CUDA_R_16F, CUBLAS_GEMM_DEFAULT));
+  } else {
+    ReportCUBLASErrors(cublasGemmStridedBatchedEx(
+        handle, transa, transb, m, n, k, &alpha, A, CUDA_R_32F, lda, strideA, B,
+        CUDA_R_32F, ldb, strideB, &beta, C,
CUDA_R_32F, ldc, strideC, + batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT)); + } +} + +template +static void cublasXGemmBatched(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + float alpha, DataType** A, int lda, DataType** B, + int ldb, float beta, DataType** C, int ldc, + int batchCount) { + const bool fp16 = std::is_same::value; + if (fp16) { + unsigned short alpha_h = FP32toFP16(alpha); + unsigned short beta_h = FP32toFP16(beta); + ReportCUBLASErrors(cublasHgemmBatched( + handle, transa, transb, m, n, k, (const half*)&alpha_h, (half**)A, lda, + (half**)B, ldb, (const half*)&beta_h, (half**)C, ldc, batchCount)); + } else { + ReportCUBLASErrors(cublasSgemmBatched( + handle, transa, transb, m, n, k, &alpha, (float**)A, lda, (float**)B, + ldb, &beta, (float**)C, ldc, batchCount)); + } +} + template AttentionPolicyHead::AttentionPolicyHead( BaseLayer* ip, const MultiHeadWeights::PolicyHead& weights, @@ -1579,67 +1640,73 @@ EncoderBlock::EncoderBlock( // GPU memory already allocated in AttentionBody. smol_global = smolgen_global_scratch; } -} -template -static void cublasXgemm(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - float alpha, const DataType* A, int lda, - const DataType* B, int ldb, float beta, DataType* C, - int ldc) { - const bool fp16 = std::is_same::value; - if (fp16) { - unsigned short alpha_h = FP32toFP16(alpha); - unsigned short beta_h = FP32toFP16(beta); - ReportCUBLASErrors(cublasHgemm( - handle, transa, transb, m, n, k, (const half*)&alpha_h, (const half*)A, - lda, (const half*)B, ldb, (const half*)&beta_h, (half*)C, ldc)); - } else { - ReportCUBLASErrors(cublasSgemm(handle, transa, transb, m, n, k, &alpha, - (const float*)A, lda, (const float*)B, ldb, - &beta, (float*)C, ldc)); - } -} + // RPE weights. + if (cpu_weights.mha.rpe_q.size() > 0 || cpu_weights.mha.rpe_k.size() > 0 || + cpu_weights.mha.rpe_v.size() > 0) { + // Weights factorizer. + int rows = 15 * 15; + int cols = 64 * 64; + int row, col; + std::vector rpe_map(rows * cols); + // 15 * 15 in units for distance pairs to 64 * 64 pairs of squares. + // Distance pairs mapped on rows, while square pairs mapped on columns. 
+ for (auto i = 0; i < 8; i++) { + for (auto j = 0; j < 8; j++) { + for (auto k = 0; k < 8; k++) { + for (auto l = 0; l < 8; l++) { + row = 15 * (i - k + 7) + (j - l + 7); + col = 64 * (i * 8 + j) + k * 8 + l; + rpe_map[row * cols + col] = 1.0f; + } + } + } + } + allocAndUpload(&mha_rpe_factorizer, rpe_map, scratch); -template -static void cublasXGemmStridedBatched( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, float alpha, const void* A, int lda, - long long int strideA, const void* B, int ldb, long long int strideB, - float beta, void* C, int ldc, long long int strideC, int batchCount) { - const bool fp16 = std::is_same::value; - if (fp16) { - unsigned short alpha_h = FP32toFP16(alpha); - unsigned short beta_h = FP32toFP16(beta); - ReportCUBLASErrors(cublasGemmStridedBatchedEx( - handle, transa, transb, m, n, k, &alpha_h, A, CUDA_R_16F, lda, strideA, - B, CUDA_R_16F, ldb, strideB, &beta_h, C, CUDA_R_16F, ldc, strideC, - batchCount, CUDA_R_16F, CUBLAS_GEMM_DEFAULT)); - } else { - ReportCUBLASErrors(cublasGemmStridedBatchedEx( - handle, transa, transb, m, n, k, &alpha, A, CUDA_R_32F, lda, strideA, B, - CUDA_R_32F, ldb, strideB, &beta, C, CUDA_R_32F, ldc, strideC, - batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT)); - } -} + // We need a cublas instance for the gemm. Create a temporary one. + cublasHandle_t tmp_cublas; + ReportCUBLASErrors(cublasCreate(&tmp_cublas)); -template -static void cublasXGemmBatched(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - float alpha, DataType** A, int lda, DataType** B, - int ldb, float beta, DataType** C, int ldc, - int batchCount) { - const bool fp16 = std::is_same::value; - if (fp16) { - unsigned short alpha_h = FP32toFP16(alpha); - unsigned short beta_h = FP32toFP16(beta); - ReportCUBLASErrors(cublasHgemmBatched( - handle, transa, transb, m, n, k, (const half*)&alpha_h, (half**)A, lda, - (half**)B, ldb, (const half*)&beta_h, (half**)C, ldc, batchCount)); - } else { - ReportCUBLASErrors(cublasSgemmBatched( - handle, transa, transb, m, n, k, &alpha, (float**)A, lda, (float**)B, - ldb, &beta, (float**)C, ldc, batchCount)); + // Allocate RPE weights and multiply by factorizer + DataType* rpe_scratch; + mha_rpe_q_size_ = cpu_weights.mha.rpe_q.size(); + if (mha_rpe_q_size_ > 0) { + allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_q, scratch); + + // Gemm to factorize the RPE Q weights. + ReportCUDAErrors( + cudaMalloc(&mha_rpe_q, mha_q_size_ * 4096 * sizeof(DataType))); + cublasXgemm(tmp_cublas, CUBLAS_OP_N, CUBLAS_OP_T, 4096, + mha_q_size_, 225, 1.0f, mha_rpe_factorizer, 4096, + rpe_scratch, mha_q_size_, 0.0f, + (DataType*)mha_rpe_q, 4096); + } + mha_rpe_k_size_ = cpu_weights.mha.rpe_k.size(); + if (mha_rpe_k_size_ > 0) { + allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_k, scratch); + + // Gemm to factorize the RPE K weights. + ReportCUDAErrors( + cudaMalloc(&mha_rpe_k, mha_k_size_ * 4096 * sizeof(DataType))); + cublasXgemm(tmp_cublas, CUBLAS_OP_N, CUBLAS_OP_T, 4096, + mha_k_size_, 225, 1.0f, mha_rpe_factorizer, 4096, + rpe_scratch, mha_k_size_, 0.0f, + (DataType*)mha_rpe_k, 4096); + } + mha_rpe_v_size_ = cpu_weights.mha.rpe_v.size(); + if (mha_rpe_v_size_ > 0) { + allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_v, scratch); + + // Gemm to factorize the RPE V weights. 
+ ReportCUDAErrors( + cudaMalloc(&mha_rpe_v, mha_v_size_ * 4096 * sizeof(DataType))); + cublasXgemm(tmp_cublas, CUBLAS_OP_N, CUBLAS_OP_T, 4096, + mha_v_size_, 225, 1.0f, mha_rpe_factorizer, 4096, + rpe_scratch, mha_v_size_, 0.0f, + (DataType*)mha_rpe_v, 4096); + } + cublasDestroy(tmp_cublas); } } @@ -1749,18 +1816,7 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, // (Maybe not, we can play with strides of the gemm and do independent gemms // for each encoder head) - // Apply scaled dot product attention: - /* - matmul_qk = tf.matmul(q, k, transpose_b=True) - dk = tf.cast(tf.shape(k)[-1], self.model_dtype) - scaled_attention_logits = matmul_qk / tf.math.sqrt(dk) - attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) - output = tf.matmul(attention_weights, v) - */ - - // shape(k)[-1] = depth - float factor = 1.0f / sqrt((float)depth); - + // Apply dot product attention // matmul_qk = tf.matmul(q, k, transpose_b=True) { if (*offset_pointers == nullptr) { @@ -1790,7 +1846,7 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, cublas, CUBLAS_OP_T, CUBLAS_OP_N, 64 /*M*/, 64 /*N*/, depth /*K*/, // A/B, and M/N are swapped for row-major to col-major // transform - factor, // to handle "/ tf.math.sqrt(dk)" + 1.0, // Scaling done after RPE logits *offset_pointers, // mha_k + offset /*A*/, d_model /*LDA*/, // (d_model = depth * encoder_heads_) to skip over // other "depth" slices / heads @@ -1808,6 +1864,30 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, N * encoder_heads_); } + // Scaling of attention logits is done after RPE. + float factor = 1.0f / sqrt((float)depth); + { + // RPE Q and K. + if (mha_rpe_q_size_ > 0) { + // Matrix-vector multiplication for query x rpe_q + // Note: mha_q here is not yet transposed, so shape is still BQHD. + // mha_q @ rpe_q: [B, Q, H, D] x [D, H, Q, K] + // Kernel performs the required transpositions. + float outScale = mha_rpe_k_size_ == 0 ? factor : 1.0f; + multiplyRPEAttentionLogits(mha_q, mha_rpe_q, buffer1, buffer1, + N, encoder_heads_, 64, 64, depth, + outScale, 0, stream); + } + if (mha_rpe_k_size_ > 0) { + // Matrix-vector multiplication for key x rpe_k + // Note: mha_k here is not yet transposed, so shape is still BKHD. + // mha_k @ rpe_k: [B, K, H, D] x [D, H, Q, K] + // Kernel performs the required transpositions. + multiplyRPEAttentionLogits(mha_k, mha_rpe_k, buffer1, buffer1, + N, encoder_heads_, 64, 64, depth, + factor, 1, stream); + } + } // attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1) // attention_weights -> buffer1 if (has_smolgen_) { @@ -1838,6 +1918,17 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, N * encoder_heads_); } + if (mha_rpe_v_size_ > 0) { + // RPE-V + // Matrix-vector multiplication for attn x rpe_v + // attn @ rpe_v: [B, H, Q, K] x [D, H, Q, K] + // Kernel performs the required transpositions. + // The matmul result (buffer2) is already with BQHD shape. 
+ multiplyRPEAttentionLogits(buffer1, mha_rpe_v, buffer2, buffer2, + N, encoder_heads_, 64, 64, depth, 1.0f, + 2, stream); + } + // #final dense layer (mha_dense), buffer2 -> buffer1 { const int num_inputs = d_model; diff --git a/src/neural/cuda/layers.h b/src/neural/cuda/layers.h index de563f9346..8eb2172181 100644 --- a/src/neural/cuda/layers.h +++ b/src/neural/cuda/layers.h @@ -353,6 +353,8 @@ class EncoderBlock { DataType *mha_v_w, *mha_v_b; DataType *mha_qkv_w, *mha_qkv_b; DataType *mha_dense_w, *mha_dense_b; + DataType *mha_rpe_q, *mha_rpe_k, *mha_rpe_v; + DataType *mha_rpe_factorizer; DataType *ln1_gammas, *ln1_betas; @@ -372,6 +374,9 @@ class EncoderBlock { int mha_k_size_; int mha_v_size_; int mha_dense_size_; + int mha_rpe_q_size_; + int mha_rpe_k_size_; + int mha_rpe_v_size_; int ffn_dense1_size_; int ffn_dense2_size_; @@ -379,7 +384,7 @@ class EncoderBlock { int embedding_op_size_; int encoder_heads_; - float alpha_; // scale to apply to skip connection add + float alpha_; // scale to apply to skip connection add float default_eps_; // value of epsilon where it wasn't specified in training const bool has_smolgen_; @@ -485,13 +490,18 @@ class AttentionBody : public BaseLayer { private: // GPU allocations to hold various weights used by the attention net body. - DataType *ip_emb_pre_w_, *ip_emb_pre_b_; // input position preprocessing weights. - DataType *ip_emb_w_, *ip_emb_b_; // "embedding" layer in net body - DataType *ip_emb_ln_g_, *ip_emb_ln_b_; // input embedding layernorm gamma and beta - DataType *ip_mult_gate_, *ip_add_gate_; // input gating - DataType *ip_emb_ffn_d1_w_, *ip_emb_ffn_d1_b_; // input embedding FFN dense1 weights - DataType *ip_emb_ffn_d2_w_, *ip_emb_ffn_d2_b_; // input embedding FFN dense2 weights - DataType *ip_emb_ffn_ln_g_, *ip_emb_ffn_ln_b_; // input embedding FFN layernorm gamma and beta + DataType *ip_emb_pre_w_, + *ip_emb_pre_b_; // input position preprocessing weights. 
+ DataType *ip_emb_w_, *ip_emb_b_; // "embedding" layer in net body + DataType *ip_emb_ln_g_, + *ip_emb_ln_b_; // input embedding layernorm gamma and beta + DataType *ip_mult_gate_, *ip_add_gate_; // input gating + DataType *ip_emb_ffn_d1_w_, + *ip_emb_ffn_d1_b_; // input embedding FFN dense1 weights + DataType *ip_emb_ffn_d2_w_, + *ip_emb_ffn_d2_b_; // input embedding FFN dense2 weights + DataType *ip_emb_ffn_ln_g_, + *ip_emb_ffn_ln_b_; // input embedding FFN layernorm gamma and beta DataType *smolgen_global_; // global smolgen weights for all encoder layers DataType *pos_encoding_; int embedding_dense_size_; @@ -523,8 +533,8 @@ class ValueHead : public BaseLayer { public: ValueHead(BaseLayer* ip, const MultiHeadWeights::ValueHead& weights, - void* scratch, bool attention_body, bool wdl, ActivationFunction act, - int max_batch_size, bool use_gemm_ex); + void* scratch, bool attention_body, bool wdl, + ActivationFunction act, int max_batch_size, bool use_gemm_ex); ~ValueHead(); void Eval(int N, DataType* output, const DataType* input, const DataType* input2, void* scratch, size_t scratch_size, @@ -548,6 +558,5 @@ class ValueHead : public BaseLayer { ActivationFunction act_; }; - } // namespace cudnn_backend } // namespace lczero diff --git a/src/neural/cuda/network_cuda.cc b/src/neural/cuda/network_cuda.cc index cf67d1336c..420b92d1fb 100644 --- a/src/neural/cuda/network_cuda.cc +++ b/src/neural/cuda/network_cuda.cc @@ -121,8 +121,8 @@ static size_t getMaxAttentionBodySize(const MultiHeadWeights& weights, int N) { template class CudaNetworkComputation : public NetworkComputation { public: - CudaNetworkComputation(CudaNetwork* network, - bool wdl, bool moves_left); + CudaNetworkComputation(CudaNetwork* network, bool wdl, + bool moves_left); ~CudaNetworkComputation(); void AddInput(InputPlanes&& input) override { @@ -530,8 +530,8 @@ class CudaNetwork : public Network { pblczero::NetworkFormat::VALUE_WDL; BaseLayer* lastlayer = attn_body_ ? encoder_last_ : resi_last_; auto value_main = std::make_unique>( - lastlayer, head, scratch_mem_, attn_body_, wdl_, act, - max_batch_size_, use_gemm_ex); + lastlayer, head, scratch_mem_, attn_body_, wdl_, act, max_batch_size_, + use_gemm_ex); network_.emplace_back(std::move(value_main)); } diff --git a/src/neural/network_legacy.cc b/src/neural/network_legacy.cc index 53846353c6..44f79c4517 100644 --- a/src/neural/network_legacy.cc +++ b/src/neural/network_legacy.cc @@ -142,7 +142,10 @@ BaseWeights::MHA::MHA(const pblczero::Weights::MHA& mha) dense_w(LayerAdapter(mha.dense_w()).as_vector()), dense_b(LayerAdapter(mha.dense_b()).as_vector()), smolgen(Smolgen(mha.smolgen())), - has_smolgen(mha.has_smolgen()) {} + has_smolgen(mha.has_smolgen()), + rpe_q(LayerAdapter(mha.rpe_q()).as_vector()), + rpe_k(LayerAdapter(mha.rpe_k()).as_vector()), + rpe_v(LayerAdapter(mha.rpe_v()).as_vector()) {} BaseWeights::FFN::FFN(const pblczero::Weights::FFN& ffn) : dense1_w(LayerAdapter(ffn.dense1_w()).as_vector()), diff --git a/src/neural/network_legacy.h b/src/neural/network_legacy.h index 72ce67544f..7ad5db82ba 100644 --- a/src/neural/network_legacy.h +++ b/src/neural/network_legacy.h @@ -81,6 +81,9 @@ struct BaseWeights { Vec dense_b; Smolgen smolgen; bool has_smolgen; + Vec rpe_q; + Vec rpe_k; + Vec rpe_v; }; struct FFN { From 9614d656c5a051e04a5a8fa701de6ac1ca02cfe5 Mon Sep 17 00:00:00 2001 From: almaudoh Date: Wed, 10 Jul 2024 04:11:17 +0200 Subject: [PATCH 2/7] Fix attention scaling factor for non-rpe nets. 
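
Networks without RPE were left with no attention scaling at all after the
previous commit, because the 1/sqrt(d_k) factor was only re-applied through
the RPE kernels' outScale. Apply the factor as the Q*K^T GEMM alpha again
when no rpe_q/rpe_k weights are present, and keep applying it after the RPE
logits otherwise. Rough sketch (illustration only; has_rpe_q / has_rpe_k
stand in for the mha_rpe_q_size_ > 0 / mha_rpe_k_size_ > 0 checks in the
diff below):

    float factor = 1.0f / sqrt((float)depth);
    float gemm_alpha = (has_rpe_q || has_rpe_k) ? 1.0f : factor;
    // RPE path: multiplyRPEAttentionLogits() applies factor as its
    // outScale on the last of the RPE-Q / RPE-K multiplications instead.
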
--- src/neural/cuda/common_kernels.cu | 6 +++--- src/neural/cuda/layers.cc | 21 ++++++++++++++++----- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index 9e058b309e..d7c61197df 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -1348,7 +1348,7 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, // output: [B, H, Q, K] // Read tensors per the input layouts and write out per the output layout. - // Sum is along the D dimension, K is along the x-axis. + // Sum is along the D dimension, and K is on the x-axis. int k = x; if (b >= B || h >= H || q >= Q || k >= K) return; @@ -1366,7 +1366,7 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, // output: [B, H, Q, K] // Read tensors per the input layouts and write out per the output layout. - // Sum is along the D dimension, and K is along the x-axis. + // Sum is along the D dimension, and K is on the x-axis. int k = x; if (b >= B || h >= H || q >= Q || k >= K) return; @@ -1385,7 +1385,7 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, // The skip connection is also already in BQHD order. // Read tensors per the input layouts and write out per the output layout. - // Sum is along the K dimension, and D is along the x-axis. + // Sum is along the K dimension, and D is on the x-axis. int d = x; if (b >= B || h >= H || q >= Q || d >= D) return; diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc index 7d0667859b..a7a930d29b 100644 --- a/src/neural/cuda/layers.cc +++ b/src/neural/cuda/layers.cc @@ -1641,7 +1641,7 @@ EncoderBlock::EncoderBlock( smol_global = smolgen_global_scratch; } - // RPE weights. + // RPE weights. /* if (cpu_weights.mha.rpe_q.size() > 0 || cpu_weights.mha.rpe_k.size() > 0 || cpu_weights.mha.rpe_v.size() > 0) { // Weights factorizer. @@ -1816,7 +1816,18 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, // (Maybe not, we can play with strides of the gemm and do independent gemms // for each encoder head) - // Apply dot product attention + // Apply scaled dot product attention: + /* + matmul_qk = tf.matmul(q, k, transpose_b=True) + dk = tf.cast(tf.shape(k)[-1], self.model_dtype) + scaled_attention_logits = matmul_qk / tf.math.sqrt(dk) + attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) + output = tf.matmul(attention_weights, v) + */ + + // shape(k)[-1] = depth + float factor = 1.0f / sqrt((float)depth); + // matmul_qk = tf.matmul(q, k, transpose_b=True) { if (*offset_pointers == nullptr) { @@ -1846,7 +1857,9 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, cublas, CUBLAS_OP_T, CUBLAS_OP_N, 64 /*M*/, 64 /*N*/, depth /*K*/, // A/B, and M/N are swapped for row-major to col-major // transform - 1.0, // Scaling done after RPE logits + mha_rpe_q_size_ > 0 || mha_rpe_k_size_ > 0 + ? 1.0f + : factor, // in RPE nets, scaling is done after RPE logits *offset_pointers, // mha_k + offset /*A*/, d_model /*LDA*/, // (d_model = depth * encoder_heads_) to skip over // other "depth" slices / heads @@ -1864,8 +1877,6 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, N * encoder_heads_); } - // Scaling of attention logits is done after RPE. - float factor = 1.0f / sqrt((float)depth); { // RPE Q and K. 
if (mha_rpe_q_size_ > 0) { From 2ee1637780a55891c4c268ce443f3730f8851b3d Mon Sep 17 00:00:00 2001 From: almaudoh Date: Mon, 15 Jul 2024 02:07:20 +0200 Subject: [PATCH 3/7] Minor optimization of kernel. Not much perf diff. --- src/neural/cuda/common_kernels.cu | 106 ++++++++++++++++++++---------- 1 file changed, 71 insertions(+), 35 deletions(-) diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index d7c61197df..5fdb5a728a 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -340,21 +340,19 @@ void addBias_NCHW(T* c, T* a, T* b, int N, int C, int H, int W, ReportCUDAErrors(cudaGetLastError()); } -template -__device__ dT readInputTensor(const sT* input_tensor, int i, int j, int k, - int l, int I, int J, int K, int L) { - // i is the outermost|slowest|most-significant index, while l is the - // innermost|fastest|least-significant index. - if (i >= I || j >= J || k >= K || l >= L) return 0; +template +__device__ dT readNCHW(const sT* input_tensor, int n, int c, int h, int w, + int Nin, int Cin, int H, int W) { + if (n >= Nin || c >= Cin) return 0; int index; - index = i; - index *= J; - index += j; - index *= K; - index += k; - index *= L; - index += l; + index = n; + index *= Cin; + index += c; + index *= H; + index += h; + index *= W; + index += w; return (dT)(input_tensor[index]); } @@ -378,7 +376,7 @@ __global__ void NCHWtoNHWC_kernel(dT* output_tensor, const sT* input_tensor, int n = index; output_tensor[tid] = - readInputTensor(input_tensor, n, c, h, w, Nin, Cin, H, W); + readNCHW(input_tensor, n, c, h, w, Nin, Cin, H, W); } template @@ -1318,16 +1316,40 @@ __device__ __forceinline__ int getTensorIndex(int i, int j, int k, int l, int I, int J, int K, int L) { if (i >= I || j >= J || k >= K || l >= L) return -1; - int index; - index = i; - index *= J; - index += j; - index *= K; - index += k; - index *= L; - index += l; - - return index; + return (((((i * J) + j) * K) + k) * L) + l; + + // int index; + // index = i; + // index *= J; + // index += j; + // index *= K; + // index += k; + // index *= L; + // index += l; + + + // return index; +} + +template +__device__ __forceinline__ dT readInputTensor(const sT* input_tensor, size_t i, size_t j, size_t k, + size_t l, size_t I, size_t J, size_t K, size_t L) { + // i is the outermost|slowest|most-significant index, while l is the + // innermost|fastest|least-significant index. 
+ if (i >= I || j >= J || k >= K || l >= L) return 0; + + // int index; + // index = i; + // index *= J; + // index += j; + // index *= K; + // index += k; + // index *= L; + // index += l; + // int index = getTensorIndex(i, j, k, l, I, J, K, L); + size_t index = (((((i * J) + j) * K) + k) * L) + l; + + return (dT)(input_tensor[index]); } template @@ -1353,12 +1375,17 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, if (b >= B || h >= H || q >= Q || k >= K) return; T sum = 0; - for (int i = 0; i < D; i++) { - sum += readInputTensor(rpeInput, b, q, h, i, B, Q, H, D) * - readInputTensor(rpeWeights, i, h, q, k, D, H, Q, K); + const T* ip = rpeInput + getTensorIndex(b, q, h, 0, B, Q, H, D); + const T* wt = rpeWeights + getTensorIndex(0, h, q, k, D, H, Q, K); + const int wtstep = H * Q * K; + + for (int i = 0, j = 0; i < D; i++, j += wtstep) { + sum += ip[i] * wt[j]; + // sum += readInputTensor(rpeInput, b, q, h, i, B, Q, H, D) * + // readInputTensor(rpeWeights, i, h, q, k, D, H, Q, K); } int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); - output[outIdx] = (sum + skipAdd[outIdx]) * (T)outScale; + output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; } else if (rpetype == 1) { // RPE-K // rpeInput: [B, K, H, D] -> transpose to [B, H, K, (1, D)] @@ -1371,12 +1398,17 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, if (b >= B || h >= H || q >= Q || k >= K) return; T sum = 0; - for (int i = 0; i < D; i++) { - sum += readInputTensor(rpeInput, b, k, h, i, B, K, H, D) * - readInputTensor(rpeWeights, i, h, q, k, D, H, Q, K); + const T* ip = rpeInput + getTensorIndex(b, k, h, 0, B, K, H, D); + const T* wt = rpeWeights + getTensorIndex(0, h, q, k, D, H, Q, K); + const int wtstep = H * Q * K; + + for (int i = 0, j = 0; i < D; i++, j += wtstep) { + sum += ip[i] * wt[j]; + // sum += readInputTensor(rpeInput, b, k, h, 0, B, K, H, D) * + // readInputTensor(rpeWeights, 0, h, q, k, D, H, Q, K); } int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); - output[outIdx] = (sum + skipAdd[outIdx]) * (T)outScale; + output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; } else if (rpetype == 2) { // RPE-V // rpeInput: [B, H, Q, K] -> transpose to [B, H, Q, (1, K)] @@ -1390,12 +1422,16 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, if (b >= B || h >= H || q >= Q || d >= D) return; T sum = 0; + const T* ip = rpeInput + getTensorIndex(b, h, q, 0, B, H, Q, K); + const T* wt = rpeWeights + getTensorIndex(d, h, q, 0, D, H, Q, K); + for (int i = 0; i < K; i++) { - sum += readInputTensor(rpeInput, b, h, q, i, B, H, Q, K) * - readInputTensor(rpeWeights, d, h, q, i, D, H, Q, K); + sum += ip[i] * wt[i]; + // sum += readInputTensor(rpeInput, b, h, q, 0, B, H, Q, K) * + // readInputTensor(rpeWeights, d, h, q, 0, D, H, Q, K); } int outIdx = getTensorIndex(b, q, h, d, B, Q, H, D); - output[outIdx] = (sum + skipAdd[outIdx]) * (T)outScale; + output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; } } From 506a4c4647792e954b4ccec8c6ebd411ab9c4cff Mon Sep 17 00:00:00 2001 From: almaudoh Date: Sun, 21 Jul 2024 05:26:43 +0200 Subject: [PATCH 4/7] Improved speed by optimizing rpe kernel. Still not enough yet. 
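
The RPE weights are now permuted once at load time (permuteTensor) so the
reduction dimension is innermost for both operands, letting each thread's
dot product use contiguous 16-byte loads via copyAs instead of the earlier
strided per-element reads. Rough sketch of the intended access pattern for
the fp32 path (base_in / base_wt are illustrative names, not identifiers
from the diff):

    // After the permutation both slices are contiguous, so the D-sum
    // (K-sum for RPE-V) can be read four floats at a time.
    float sum = 0.0f;
    for (int i = 0; i < D; i += 4) {
      float4 u = *reinterpret_cast<const float4*>(&input[base_in + i]);
      float4 w = *reinterpret_cast<const float4*>(&weights[base_wt + i]);
      sum += u.x * w.x + u.y * w.y + u.z * w.z + u.w * w.w;
    }
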
--- src/neural/cuda/common_kernels.cu | 145 ++++++++++++++++++++++-------- src/neural/cuda/kernels.h | 4 + src/neural/cuda/layers.cc | 39 +++++--- 3 files changed, 141 insertions(+), 47 deletions(-) diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index 5fdb5a728a..6ddfdbf49f 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -1316,7 +1316,7 @@ __device__ __forceinline__ int getTensorIndex(int i, int j, int k, int l, int I, int J, int K, int L) { if (i >= I || j >= J || k >= K || l >= L) return -1; - return (((((i * J) + j) * K) + k) * L) + l; + return (((((i * J) + j) * K) + k) * L) + l; // int index; // index = i; @@ -1327,13 +1327,14 @@ __device__ __forceinline__ int getTensorIndex(int i, int j, int k, int l, int I, // index *= L; // index += l; - // return index; } template -__device__ __forceinline__ dT readInputTensor(const sT* input_tensor, size_t i, size_t j, size_t k, - size_t l, size_t I, size_t J, size_t K, size_t L) { +__device__ __forceinline__ dT readInputTensor(const sT* input_tensor, size_t i, + size_t j, size_t k, size_t l, + size_t I, size_t J, size_t K, + size_t L) { // i is the outermost|slowest|most-significant index, while l is the // innermost|fastest|least-significant index. if (i >= I || j >= J || k >= K || l >= L) return 0; @@ -1352,6 +1353,39 @@ __device__ __forceinline__ dT readInputTensor(const sT* input_tensor, size_t i, return (dT)(input_tensor[index]); } +template +__device__ __forceinline__ T dotProductSum(const T* U, const T* V, int length, + bool fp16) { + T sum = 0; + // Load from memory (16 elements a time) + if (fp16) { + half u[8]; + half v[8]; +#pragma unroll + for (int h = 0; h < length; h += 8) { + copyAs(&u[0], &U[h]); + copyAs(&v[0], &V[h]); +#pragma unroll + for (int i = 0; i < 8; i++) { + sum += (T)u[i] * (T)v[i]; + } + } + } else { + float u[8]; + float v[8]; +#pragma unroll + for (int h = 0; h < length; h += 4) { + copyAs(&u[0], &U[h]); + copyAs(&v[0], &V[h]); +#pragma unroll + for (int i = 0; i < 8; i++) { + sum += u[i] * v[i]; + } + } + } + return sum; +} + template __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, const T* skipAdd, T* output, int B, @@ -1362,11 +1396,12 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, int bh = threadIdx.z + blockDim.z * blockIdx.z; int h = bh % H; int b = bh / H; + const bool fp16 = std::is_same::value; if (rpetype == 0) { // RPE-Q // rpeInput: [B, Q, H, D] -> transpose to [B, H, Q, (1, D)] - // rpeWeights: [D, H, Q, K] -> transpose to [1, H, Q, (D, K)] + // rpeWeights: [H, Q, D, K] -> transpose to [1, H, Q, (D, K)] // output: [B, H, Q, K] // Read tensors per the input layouts and write out per the output layout. 
@@ -1374,22 +1409,17 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, int k = x; if (b >= B || h >= H || q >= Q || k >= K) return; - T sum = 0; - const T* ip = rpeInput + getTensorIndex(b, q, h, 0, B, Q, H, D); - const T* wt = rpeWeights + getTensorIndex(0, h, q, k, D, H, Q, K); - const int wtstep = H * Q * K; + const int tensorIndex = getTensorIndex(b, q, h, 0, B, Q, H, D); + const int weightIndex = getTensorIndex(h, q, k, 0, H, Q, K, D); - for (int i = 0, j = 0; i < D; i++, j += wtstep) { - sum += ip[i] * wt[j]; - // sum += readInputTensor(rpeInput, b, q, h, i, B, Q, H, D) * - // readInputTensor(rpeWeights, i, h, q, k, D, H, Q, K); - } + T sum = dotProductSum(rpeInput + tensorIndex, rpeWeights + weightIndex, D, + fp16); int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; } else if (rpetype == 1) { // RPE-K // rpeInput: [B, K, H, D] -> transpose to [B, H, K, (1, D)] - // rpeWeights: [D, H, Q, K] -> transpose to [1, H, K, (D, Q)] + // rpeWeights: [H, K, Q, D] -> transpose to [1, H, K, (D, Q)] // output: [B, H, Q, K] // Read tensors per the input layouts and write out per the output layout. @@ -1397,22 +1427,16 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, int k = x; if (b >= B || h >= H || q >= Q || k >= K) return; - T sum = 0; - const T* ip = rpeInput + getTensorIndex(b, k, h, 0, B, K, H, D); - const T* wt = rpeWeights + getTensorIndex(0, h, q, k, D, H, Q, K); - const int wtstep = H * Q * K; - - for (int i = 0, j = 0; i < D; i++, j += wtstep) { - sum += ip[i] * wt[j]; - // sum += readInputTensor(rpeInput, b, k, h, 0, B, K, H, D) * - // readInputTensor(rpeWeights, 0, h, q, k, D, H, Q, K); - } + const int tensorIndex = getTensorIndex(b, k, h, 0, B, K, H, D); + const int weightIndex = getTensorIndex(h, k, q, 0, H, K, Q, D); + T sum = dotProductSum(rpeInput + tensorIndex, rpeWeights + weightIndex, D, + fp16); int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; } else if (rpetype == 2) { // RPE-V // rpeInput: [B, H, Q, K] -> transpose to [B, H, Q, (1, K)] - // rpeWeights: [D, H, Q, K] -> transpose to [1, H, Q, (K, D)] + // rpeWeights: [H, Q, K, D] -> transpose to [1, H, Q, (K, D)] // output: [B, Q, H, D] // The skip connection is also already in BQHD order. @@ -1421,15 +1445,10 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, int d = x; if (b >= B || h >= H || q >= Q || d >= D) return; - T sum = 0; - const T* ip = rpeInput + getTensorIndex(b, h, q, 0, B, H, Q, K); - const T* wt = rpeWeights + getTensorIndex(d, h, q, 0, D, H, Q, K); - - for (int i = 0; i < K; i++) { - sum += ip[i] * wt[i]; - // sum += readInputTensor(rpeInput, b, h, q, 0, B, H, Q, K) * - // readInputTensor(rpeWeights, d, h, q, 0, D, H, Q, K); - } + const int tensorIndex = getTensorIndex(b, h, q, 0, B, H, Q, K); + const int weightIndex = getTensorIndex(h, q, d, 0, H, Q, D, K); + T sum = dotProductSum(rpeInput + tensorIndex, rpeWeights + weightIndex, K, + fp16); int outIdx = getTensorIndex(b, q, h, d, B, Q, H, D); output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; } @@ -1468,6 +1487,54 @@ void multiplyRPEAttentionLogits(const T* rpeInput, const T* rpeWeights, ReportCUDAErrors(cudaGetLastError()); } +template +__global__ void permuteTensor_kernel(T* output, const T* input, int s1, int s2, + int s3, int s4, int p1, int p2, int p3, + int p4) { + // Shared memory for shape of tensor. 
+ __shared__ int tshp[4]; + if (threadIdx.x == 0) { + tshp[0] = s1; + tshp[1] = s2; + tshp[2] = s3; + tshp[3] = s4; + } + __syncthreads(); + + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= s1 * s2 * s3 * s4) return; + + int tloc[] = {0, 0, 0, 0}; + int index = tid; + + tloc[3] = (index % s4); + index /= s4; + tloc[2] = index % s3; + index /= s3; + tloc[1] = index % s2; + index /= s2; + tloc[0] = index; + + int outIdx = (((((tloc[p1] * tshp[p2]) + tloc[p2]) * tshp[p3]) + tloc[p3]) * + tshp[p4]) + + tloc[p4]; + output[outIdx] = (T)input[tid]; +} + +template +void permuteTensor(T* output, const T* input, int s1, int s2, int s3, int s4, + int p1, int p2, int p3, int p4, cudaStream_t stream) { + // The order and bounds arrays are assumed to have 4 elements. + int elements = s1 * s2 * s3 * s4; + const int kBlockSize = 1024; + int blocks = DivUp(elements, kBlockSize); + + permuteTensor_kernel<<>>( + output, input, s1, s2, s3, s4, p1, p2, p3, p4); + ReportCUDAErrors(cudaGetLastError()); +} + // Template instantiation. template void copyTypeConverted(half* op, float* ip, int N, cudaStream_t stream); @@ -1765,5 +1832,13 @@ template void multiplyRPEAttentionLogits( float* output, int B, int H, int Q, int K, int D, float outScale, size_t rpetype, cudaStream_t stream); +template void permuteTensor(half* output, const half* input, int s1, + int s2, int s3, int s4, int p1, int p2, + int p3, int p4, cudaStream_t stream); + +template void permuteTensor(float* output, const float* input, int s1, + int s2, int s3, int s4, int p1, int p2, + int p3, int p4, cudaStream_t stream); + } // namespace cudnn_backend } // namespace lczero diff --git a/src/neural/cuda/kernels.h b/src/neural/cuda/kernels.h index 31db577808..15571b6acf 100644 --- a/src/neural/cuda/kernels.h +++ b/src/neural/cuda/kernels.h @@ -162,5 +162,9 @@ void multiplyRPEAttentionLogits(const T* rpeInput, const T* rpeWeights, int Q, int K, int D, float outScale, size_t rpetype, cudaStream_t stream); +template +void permuteTensor(T* output, const T* input, int s1, int s2, int s3, int s4, + int p1, int p2, int p3, int p4, cudaStream_t stream); + } // namespace cudnn_backend } // namespace lczero diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc index a7a930d29b..0efe24c8b3 100644 --- a/src/neural/cuda/layers.cc +++ b/src/neural/cuda/layers.cc @@ -1670,41 +1670,56 @@ EncoderBlock::EncoderBlock( // Allocate RPE weights and multiply by factorizer DataType* rpe_scratch; + int heads = encoder_heads_; + int depth = mha_q_size_ / encoder_heads_; mha_rpe_q_size_ = cpu_weights.mha.rpe_q.size(); if (mha_rpe_q_size_ > 0) { allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_q, scratch); // Gemm to factorize the RPE Q weights. - ReportCUDAErrors( - cudaMalloc(&mha_rpe_q, mha_q_size_ * 4096 * sizeof(DataType))); cublasXgemm(tmp_cublas, CUBLAS_OP_N, CUBLAS_OP_T, 4096, mha_q_size_, 225, 1.0f, mha_rpe_factorizer, 4096, - rpe_scratch, mha_q_size_, 0.0f, - (DataType*)mha_rpe_q, 4096); + rpe_scratch, mha_q_size_, 0.0f, (DataType*)scratch, + 4096); + + // Permute RPE Q weights: [D, H, Q, K] -> [H, Q, D, K] + // Permute RPE Q weights: [D, H, Q, K] -> [H, Q, K, D] + ReportCUDAErrors( + cudaMalloc(&mha_rpe_q, mha_q_size_ * 4096 * sizeof(DataType))); + permuteTensor((DataType*)mha_rpe_q, (const DataType*)scratch, depth, heads, 64, + 64, 1, 2, 3, 0, 0); } mha_rpe_k_size_ = cpu_weights.mha.rpe_k.size(); if (mha_rpe_k_size_ > 0) { allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_k, scratch); // Gemm to factorize the RPE K weights. 
- ReportCUDAErrors( - cudaMalloc(&mha_rpe_k, mha_k_size_ * 4096 * sizeof(DataType))); cublasXgemm(tmp_cublas, CUBLAS_OP_N, CUBLAS_OP_T, 4096, mha_k_size_, 225, 1.0f, mha_rpe_factorizer, 4096, - rpe_scratch, mha_k_size_, 0.0f, - (DataType*)mha_rpe_k, 4096); + rpe_scratch, mha_k_size_, 0.0f, (DataType*)scratch, + 4096); + + // Permute RPE K weights: [D, H, Q, K] -> [H, K, D, Q] + ReportCUDAErrors( + cudaMalloc(&mha_rpe_k, mha_k_size_ * 4096 * sizeof(DataType))); + permuteTensor((DataType*)mha_rpe_k, (const DataType*)scratch, depth, + heads, 64, 64, 1, 3, 2, 0, 0); } mha_rpe_v_size_ = cpu_weights.mha.rpe_v.size(); if (mha_rpe_v_size_ > 0) { allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_v, scratch); // Gemm to factorize the RPE V weights. - ReportCUDAErrors( - cudaMalloc(&mha_rpe_v, mha_v_size_ * 4096 * sizeof(DataType))); cublasXgemm(tmp_cublas, CUBLAS_OP_N, CUBLAS_OP_T, 4096, mha_v_size_, 225, 1.0f, mha_rpe_factorizer, 4096, - rpe_scratch, mha_v_size_, 0.0f, - (DataType*)mha_rpe_v, 4096); + rpe_scratch, mha_v_size_, 0.0f, (DataType*)scratch, + 4096); + + // Permute RPE V weights: [D, H, Q, K] -> [H, Q, K, D] + ReportCUDAErrors( + cudaMalloc(&mha_rpe_v, mha_v_size_ * 4096 * sizeof(DataType))); + permuteTensor((DataType*)mha_rpe_v, (const DataType*)scratch, depth, + heads, 64, 64, 1, 2, 0, 3, 0); } cublasDestroy(tmp_cublas); } From 70d267287aa33dfa9c282b1a14b3dda3343aa870 Mon Sep 17 00:00:00 2001 From: almaudoh Date: Sun, 21 Jul 2024 20:15:00 +0200 Subject: [PATCH 5/7] Fix bugs in fp32 and non-rpe code paths. --- src/neural/cuda/common_kernels.cu | 2 +- src/neural/cuda/layers.cc | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index 6ddfdbf49f..9337e26bea 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -1378,7 +1378,7 @@ __device__ __forceinline__ T dotProductSum(const T* U, const T* V, int length, copyAs(&u[0], &U[h]); copyAs(&v[0], &V[h]); #pragma unroll - for (int i = 0; i < 8; i++) { + for (int i = 0; i < 4; i++) { sum += u[i] * v[i]; } } diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc index 0efe24c8b3..e610214870 100644 --- a/src/neural/cuda/layers.cc +++ b/src/neural/cuda/layers.cc @@ -1641,9 +1641,12 @@ EncoderBlock::EncoderBlock( smol_global = smolgen_global_scratch; } - // RPE weights. /* - if (cpu_weights.mha.rpe_q.size() > 0 || cpu_weights.mha.rpe_k.size() > 0 || - cpu_weights.mha.rpe_v.size() > 0) { + // RPE weights. + mha_rpe_q_size_ = cpu_weights.mha.rpe_q.size(); + mha_rpe_k_size_ = cpu_weights.mha.rpe_k.size(); + mha_rpe_v_size_ = cpu_weights.mha.rpe_v.size(); + + if (mha_rpe_q_size_ > 0 || mha_rpe_k_size_ > 0 || mha_rpe_v_size_ > 0) { // Weights factorizer. 
int rows = 15 * 15; int cols = 64 * 64; @@ -1672,7 +1675,6 @@ EncoderBlock::EncoderBlock( DataType* rpe_scratch; int heads = encoder_heads_; int depth = mha_q_size_ / encoder_heads_; - mha_rpe_q_size_ = cpu_weights.mha.rpe_q.size(); if (mha_rpe_q_size_ > 0) { allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_q, scratch); @@ -1686,10 +1688,9 @@ EncoderBlock::EncoderBlock( // Permute RPE Q weights: [D, H, Q, K] -> [H, Q, K, D] ReportCUDAErrors( cudaMalloc(&mha_rpe_q, mha_q_size_ * 4096 * sizeof(DataType))); - permuteTensor((DataType*)mha_rpe_q, (const DataType*)scratch, depth, heads, 64, - 64, 1, 2, 3, 0, 0); + permuteTensor((DataType*)mha_rpe_q, (const DataType*)scratch, depth, + heads, 64, 64, 1, 2, 3, 0, 0); } - mha_rpe_k_size_ = cpu_weights.mha.rpe_k.size(); if (mha_rpe_k_size_ > 0) { allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_k, scratch); @@ -1705,7 +1706,6 @@ EncoderBlock::EncoderBlock( permuteTensor((DataType*)mha_rpe_k, (const DataType*)scratch, depth, heads, 64, 64, 1, 3, 2, 0, 0); } - mha_rpe_v_size_ = cpu_weights.mha.rpe_v.size(); if (mha_rpe_v_size_ > 0) { allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_v, scratch); From 75a099fe36b74f908c6bb8b9386fe9327428c028 Mon Sep 17 00:00:00 2001 From: almaudoh Date: Mon, 2 Sep 2024 14:13:50 +0200 Subject: [PATCH 6/7] Split kernel dot product workload among multiple threads in warp --- src/neural/cuda/common_kernels.cu | 226 +++++++++++++++++++++++++----- src/neural/cuda/layers.cc | 58 ++++++-- 2 files changed, 237 insertions(+), 47 deletions(-) diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index 9337e26bea..2d410f3b9b 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -37,6 +37,7 @@ namespace lczero { namespace cudnn_backend { namespace { constexpr int kInputPlanes = 112; +constexpr int kNumRpeKernelSplits = 2; } // namespace ///////////////////////////////////////////////////////////////////////////// @@ -1353,16 +1354,123 @@ __device__ __forceinline__ dT readInputTensor(const sT* input_tensor, size_t i, return (dT)(input_tensor[index]); } +__device__ __forceinline__ float sharedDotProductSum(float val, int x, int y) { + // Sum is done along the x-axis, while y is the accumulator. + int warpPos = y % 32; + __shared__ float partialSum[32]; + if (x == 0) partialSum[warpPos] = 0.0f; + + __syncthreads(); + + // Get warp-wide sum. + float warpSum = warpReduce(val); + if (x == 0) atomicAdd(&partialSum[warpPos], warpSum); + + __syncthreads(); + + return partialSum[warpPos]; +} + +template +__global__ void rpeVectorMultiply_parallel_kernel( + const T* rpeInput, const T* rpeWeights, const T* skipAdd, T* output, int B, + int H, int Q, int K, int D, float outScale, size_t rpetype) { + int x = threadIdx.x; + int y = threadIdx.y + blockDim.y * blockIdx.y; + int z = threadIdx.z + blockDim.z * blockIdx.z; + int h = z % H; + z = z / H; + int q = z % Q; + z = z / Q; + int b = z % B; + + if (rpetype == 0) { + // RPE-Q + // rpeInput: [B, Q, H, D] -> transpose to [B, H, Q, (1, D)] + // rpeWeights: [H, Q, K, D] -> transpose to [1, H, Q, (D, K)] + // output: [B, H, Q, K] + + // Read tensors per the input layouts and write out per the output layout. + // Sum is along the D dimension, and K is on the x-axis. + // Each thread handles one product of the sum. Thread 0 sums the products. 
+ int d = x; + int k = y; + if (b >= B || h >= H || q >= Q || k >= K || d >= D) return; + + const int tensorIndex = getTensorIndex(b, q, h, d, B, Q, H, D); + const int weightIndex = getTensorIndex(h, q, k, d, H, Q, K, D); + + T sum = (T)sharedDotProductSum( + (float)rpeInput[tensorIndex] * (float)rpeWeights[weightIndex], x, y); + + if (d == 0) { + int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); + output[outIdx] = (sum + (T)skipAdd[outIdx]) * (T)outScale; + } + } else if (rpetype == 1) { + // RPE-K + // rpeInput: [B, K, H, D] -> transpose to [B, H, K, (1, D)] + // rpeWeights: [H, K, Q, D] -> transpose to [1, H, K, (D, Q)] + // output: [B, H, Q, K] + + // Read tensors per the input layouts and write out per the output layout. + // Sum is along the D dimension, and K is on the x-axis. + // Each thread handles one product of the sum. Thread 0 sums the products. + int d = x; + int k = y; + if (b >= B || h >= H || q >= Q || k >= K || d >= D) return; + + const int tensorIndex = getTensorIndex(b, k, h, d, B, K, H, D); + const int weightIndex = getTensorIndex(h, k, q, d, H, K, Q, D); + + T sum = (T)sharedDotProductSum( + (float)rpeInput[tensorIndex] * (float)rpeWeights[weightIndex], x, y); + + if (d == 0) { + int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); + output[outIdx] = (sum + (T)skipAdd[outIdx]) * (T)outScale; + } + } else if (rpetype == 2) { + // RPE-V + // rpeInput: [B, H, Q, K] -> transpose to [B, H, Q, (1, K)] + // rpeWeights: [H, Q, D, K] -> transpose to [1, H, Q, (K, D)] + // output: [B, Q, H, D] + // The skip connection is also already in BQHD order. + + // Read tensors per the input layouts and write out per the output layout. + // Sum is along the K dimension, and D is on the x-axis. + int k = x; + int d = y; + if (b >= B || h >= H || q >= Q || k >= K || d >= D) return; + + const int tensorIndex = getTensorIndex(b, h, q, k, B, H, Q, K); + const int weightIndex = getTensorIndex(h, q, d, k, H, Q, D, K); + + T sum = (T)sharedDotProductSum( + (float)rpeInput[tensorIndex] * (float)rpeWeights[weightIndex], x, y); + + if (k == 0) { + int outIdx = getTensorIndex(b, q, h, d, B, Q, H, D); + output[outIdx] = (sum + (T)skipAdd[outIdx]) * (T)outScale; + } + } +} + template -__device__ __forceinline__ T dotProductSum(const T* U, const T* V, int length, - bool fp16) { +__device__ __forceinline__ T dotProductSum(int x, const T* U, const T* V, + int length, bool fp16) { + assert(length >= 16); T sum = 0; + int sublen = length / kNumRpeKernelSplits; + int lane = x & (kNumRpeKernelSplits - 1); + int start = lane * sublen; + // Load from memory (16 elements a time) if (fp16) { half u[8]; half v[8]; #pragma unroll - for (int h = 0; h < length; h += 8) { + for (int h = start; h < start + sublen; h += 8) { copyAs(&u[0], &U[h]); copyAs(&v[0], &V[h]); #pragma unroll @@ -1371,18 +1479,25 @@ __device__ __forceinline__ T dotProductSum(const T* U, const T* V, int length, } } } else { - float u[8]; - float v[8]; + float u[4]; + float v[4]; #pragma unroll - for (int h = 0; h < length; h += 4) { + for (int h = start; h < start + sublen; h += 4) { copyAs(&u[0], &U[h]); copyAs(&v[0], &V[h]); #pragma unroll for (int i = 0; i < 4; i++) { - sum += u[i] * v[i]; + sum += (T)u[i] * (T)v[i]; } } } + + // Warp-level reduction to sum up adjacent threads. 
+ __syncwarp(); +#pragma unroll + for (int i = 1; i < kNumRpeKernelSplits; i = i << 1) { + sum += __shfl_down_sync(0xffffffff, sum, i); + } return sum; } @@ -1391,31 +1506,48 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, const T* skipAdd, T* output, int B, int H, int Q, int K, int D, float outScale, size_t rpetype) { - int x = threadIdx.x + blockDim.x * blockIdx.x; - int q = threadIdx.y + blockDim.y * blockIdx.y; - int bh = threadIdx.z + blockDim.z * blockIdx.z; - int h = bh % H; - int b = bh / H; + const int x = threadIdx.x + blockDim.x * blockIdx.x; + const int q = threadIdx.y + blockDim.y * blockIdx.y; + const int bh = threadIdx.z + blockDim.z * blockIdx.z; + const int h = bh % H; + const int b = bh / H; const bool fp16 = std::is_same::value; + const int lane = x & (kNumRpeKernelSplits - 1); if (rpetype == 0) { // RPE-Q // rpeInput: [B, Q, H, D] -> transpose to [B, H, Q, (1, D)] - // rpeWeights: [H, Q, D, K] -> transpose to [1, H, Q, (D, K)] + // rpeWeights: [H, Q, K, D] -> transpose to [1, H, Q, (D, K)] // output: [B, H, Q, K] // Read tensors per the input layouts and write out per the output layout. // Sum is along the D dimension, and K is on the x-axis. - int k = x; + int k = x / kNumRpeKernelSplits; if (b >= B || h >= H || q >= Q || k >= K) return; const int tensorIndex = getTensorIndex(b, q, h, 0, B, Q, H, D); const int weightIndex = getTensorIndex(h, q, k, 0, H, Q, K, D); - T sum = dotProductSum(rpeInput + tensorIndex, rpeWeights + weightIndex, D, - fp16); - int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); - output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; + T sum = dotProductSum(x, rpeInput + tensorIndex, rpeWeights + weightIndex, + D, fp16); + + // if (b == 0 && h == 0 && q == 0 && k == 0) { + // // if (lane == 0) { + // // sum += __shfl_down_sync(0xffffffff, sum, 1); + // // sum = __shfl_down_sync(0xffffffff, sum, 1); + // // return sum; + // // printf("\n"); + // for (int i = 0; i < 32; i++) { + // printf("x: %i, lane: %i, thread warp id: %d, sum: %f\n", x, lane, + // i, (float)__shfl_sync(0xffffffff, sum, i)); + // } + // // } + // printf("x: %d, sum: %f\n", x, (float)sum); + // } + if (lane == 0) { + int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); + output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; + } } else if (rpetype == 1) { // RPE-K // rpeInput: [B, K, H, D] -> transpose to [B, H, K, (1, D)] @@ -1424,33 +1556,37 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, // Read tensors per the input layouts and write out per the output layout. // Sum is along the D dimension, and K is on the x-axis. 
- int k = x; + int k = x / kNumRpeKernelSplits; if (b >= B || h >= H || q >= Q || k >= K) return; const int tensorIndex = getTensorIndex(b, k, h, 0, B, K, H, D); const int weightIndex = getTensorIndex(h, k, q, 0, H, K, Q, D); - T sum = dotProductSum(rpeInput + tensorIndex, rpeWeights + weightIndex, D, - fp16); - int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); - output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; + T sum = dotProductSum(x, rpeInput + tensorIndex, rpeWeights + weightIndex, + D, fp16); + if (lane == 0) { + int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); + output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; + } } else if (rpetype == 2) { // RPE-V // rpeInput: [B, H, Q, K] -> transpose to [B, H, Q, (1, K)] - // rpeWeights: [H, Q, K, D] -> transpose to [1, H, Q, (K, D)] + // rpeWeights: [H, Q, D, K] -> transpose to [1, H, Q, (K, D)] // output: [B, Q, H, D] // The skip connection is also already in BQHD order. // Read tensors per the input layouts and write out per the output layout. // Sum is along the K dimension, and D is on the x-axis. - int d = x; + int d = x / kNumRpeKernelSplits; if (b >= B || h >= H || q >= Q || d >= D) return; const int tensorIndex = getTensorIndex(b, h, q, 0, B, H, Q, K); const int weightIndex = getTensorIndex(h, q, d, 0, H, Q, D, K); - T sum = dotProductSum(rpeInput + tensorIndex, rpeWeights + weightIndex, K, - fp16); - int outIdx = getTensorIndex(b, q, h, d, B, Q, H, D); - output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; + T sum = dotProductSum(x, rpeInput + tensorIndex, rpeWeights + weightIndex, + K, fp16); + if (lane == 0) { + int outIdx = getTensorIndex(b, q, h, d, B, Q, H, D); + output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; + } } } @@ -1465,25 +1601,47 @@ void multiplyRPEAttentionLogits(const T* rpeInput, const T* rpeWeights, // rpeType: Q | K | V // rpeInput: [B, Q, H, D] | [B, K, H, D] | [B, H, Q, K] - // rpeWeights: [D, H, Q, K] | [D, H, Q, K] | [D, H, Q, K] + // rpeWeights: [H, Q, K, D] | [H, K, Q, D] | [H, Q, D, K] + // attnInput: [B, H, Q, K] | [B, H, Q, K] | [B, Q, H, D] + // int lda = (rpetype == 2) ? K : D; + // if (lda > 64 || rpetype != 0) { +#if 1 // 3D block structure where x-axis maps to K (or D for rpetype==2), y-axis to // Q and z-axis to BH. Each thread calculates the vector sum-product for the // cube at BHQK | BHQD (i.e. "d,dk->k" | "d,dq->q" | "k,kd->d"). dim3 blockDim, gridDim; int X = rpetype == 2 ? D : K; blockDim.x = std::min(32, X); - blockDim.y = std::min(32, Q); - blockDim.z = std::min(std::max(1024 / (blockDim.x * blockDim.y), 1u), + blockDim.y = std::min(16, Q); + blockDim.z = std::min(std::max(512 / (blockDim.x * blockDim.y), 1u), (unsigned int)(B * H)); - gridDim.x = DivUp(X, blockDim.x); + gridDim.x = DivUp(X * kNumRpeKernelSplits, blockDim.x); gridDim.y = DivUp(Q, blockDim.y); gridDim.z = DivUp(B * H, blockDim.z); rpeVectorMultiply_kernel<<>>( rpeInput, rpeWeights, attnInput, output, B, H, Q, K, D, outScale, rpetype); - +#else + // } else { + // 3D block structure where x-axis maps to D (or K for rpetype==2), y-axis + // maps to K (or D for rpetype==2) and z-axis to BQH. Each thread calculates + // the product while the warp-sum does the sum of the vector dot-product. + dim3 blockDim, gridDim; + int Y = rpetype == 2 ? 
D : K; + blockDim.x = std::min(64, lda); + blockDim.y = std::min(DivUp(1024, blockDim.x), Y); + blockDim.z = std::max(1024 / (blockDim.x * blockDim.y), 1u); + gridDim.x = 1; + gridDim.y = DivUp(Y, blockDim.y); + gridDim.z = DivUp(B * H * Q, blockDim.z); + + rpeVectorMultiply_parallel_kernel<<>>( + rpeInput, rpeWeights, attnInput, output, B, H, Q, K, D, outScale, + rpetype); +#endif + // } ReportCUDAErrors(cudaGetLastError()); } diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc index e610214870..c7f1add37f 100644 --- a/src/neural/cuda/layers.cc +++ b/src/neural/cuda/layers.cc @@ -39,7 +39,27 @@ namespace lczero { -#if 0 +#if 1 +// function to calculate mean +static float mean(float arr[], int n) { + float sum = 0; + for (int i = 0; i < n; i++) { + sum += arr[i]; + } + return sum / n; +} + +// function to calculate standard deviation +static float stdDev(float arr[], int n) { + float m = mean(arr, n); // get the mean + float var = 0; // initialize variance + for (int i = 0; i < n; i++) { + var += pow(arr[i] - m, 2); // add the squared difference from mean + } + var /= n; // divide by number of elements + return sqrt(var); // return the square root of variance +} + // debug code to dump allocation in GPU memory template void dumpTensor(T* memory, int elements, const char* message, int lines = -1, @@ -52,9 +72,10 @@ void dumpTensor(T* memory, int elements, const char* message, int lines = -1, cudaMemcpy(temp, memory, bytes, cudaMemcpyDeviceToHost); float maxval = -std::numeric_limits::max(); float minval = std::numeric_limits::max(); - int nans = 0; - int nanss[10]{}; + int cnans = 0; + int nans[10]{}; + std::vector fpArr(elements); for (int i = 0; i < elements; i++) { float val; if (fp16) { @@ -64,12 +85,13 @@ void dumpTensor(T* memory, int elements, const char* message, int lines = -1, float* arr = (float*)temp; val = arr[i]; } + fpArr[i] = val; maxval = std::max(maxval, val); minval = std::min(minval, val); if (std::isnan(val)) { - if (nans < 10) nanss[nans] = i; - nans++; + if (cnans < 10) nans[cnans] = i; + cnans++; } if ((i >= start && (i < start + lines || lines == -1)) || @@ -85,11 +107,18 @@ void dumpTensor(T* memory, int elements, const char* message, int lines = -1, if (minval == std::numeric_limits::max()) minval = std::numeric_limits::quiet_NaN(); - printf("Max: %.6f, Min: %.6f, NaNs: %i of %i", maxval, minval, nans, - elements); - printf("\nNaN indices: "); - for (int i = 0; i < nans && i < 10; i++) printf("%i ", nanss[i]); - if (nans > 10) printf("......"); + float avg = mean(&fpArr[0], elements); + float stddev = stdDev(&fpArr[0], elements); + printf( + "Max: %.6f, Min: %.6f, Mean: %.6f, StdDev: %.6f\n" + "NaNs: %i of %i", + maxval, minval, avg, stddev, cnans, elements); + + if (cnans > 0) { + printf("\nNaN indices: "); + for (int i = 0; i < cnans && i < 10; i++) printf("%i ", nans[i]); + if (cnans > 10) printf("......"); + } printf("\n"); } #endif @@ -1684,7 +1713,6 @@ EncoderBlock::EncoderBlock( rpe_scratch, mha_q_size_, 0.0f, (DataType*)scratch, 4096); - // Permute RPE Q weights: [D, H, Q, K] -> [H, Q, D, K] // Permute RPE Q weights: [D, H, Q, K] -> [H, Q, K, D] ReportCUDAErrors( cudaMalloc(&mha_rpe_q, mha_q_size_ * 4096 * sizeof(DataType))); @@ -1700,7 +1728,7 @@ EncoderBlock::EncoderBlock( rpe_scratch, mha_k_size_, 0.0f, (DataType*)scratch, 4096); - // Permute RPE K weights: [D, H, Q, K] -> [H, K, D, Q] + // Permute RPE K weights: [D, H, Q, K] -> [H, K, Q, D] ReportCUDAErrors( cudaMalloc(&mha_rpe_k, mha_k_size_ * 4096 * sizeof(DataType))); 
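+      // Illustrative note (assumption about permuteTensor's semantics, added
+      // for clarity): the [D, H, Q, K] -> [H, K, Q, D] permutation above is
+      // read here as dst[h][k][q][d] = src[d][h][q][k], which leaves the D
+      // elements of each (h, k, q) weight row contiguous for the vectorized
+      // loads in the RPE kernels.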
permuteTensor((DataType*)mha_rpe_k, (const DataType*)scratch, depth, @@ -1715,7 +1743,7 @@ EncoderBlock::EncoderBlock( rpe_scratch, mha_v_size_, 0.0f, (DataType*)scratch, 4096); - // Permute RPE V weights: [D, H, Q, K] -> [H, Q, K, D] + // Permute RPE V weights: [D, H, Q, K] -> [H, Q, D, K] ReportCUDAErrors( cudaMalloc(&mha_rpe_v, mha_v_size_ * 4096 * sizeof(DataType))); permuteTensor((DataType*)mha_rpe_v, (const DataType*)scratch, depth, @@ -1903,6 +1931,8 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, multiplyRPEAttentionLogits(mha_q, mha_rpe_q, buffer1, buffer1, N, encoder_heads_, 64, 64, depth, outScale, 0, stream); + // dumpTensor((DataType*)buffer1, 64 * d_model * N, "from kernel", 8192); + // exit(0); } if (mha_rpe_k_size_ > 0) { // Matrix-vector multiplication for key x rpe_k @@ -1953,6 +1983,8 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, multiplyRPEAttentionLogits(buffer1, mha_rpe_v, buffer2, buffer2, N, encoder_heads_, 64, 64, depth, 1.0f, 2, stream); + // dumpTensor((DataType*)buffer2, 4096 * d_model * N, "from kernel", 8192); + // exit(0); } // #final dense layer (mha_dense), buffer2 -> buffer1 From 237766a90330fc33bafd566f80784d56b4bd5810 Mon Sep 17 00:00:00 2001 From: almaudoh Date: Sun, 8 Sep 2024 19:36:30 +0200 Subject: [PATCH 7/7] Fuse RPE-Q and RPE-K kernels. 10% speedup. --- src/neural/cuda/common_kernels.cu | 84 ++++++++++++++++++++++++++----- src/neural/cuda/kernels.h | 7 +++ src/neural/cuda/layers.cc | 53 +++++++++++-------- 3 files changed, 109 insertions(+), 35 deletions(-) diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index 2d410f3b9b..7b0bb9f277 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -1531,19 +1531,6 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, T sum = dotProductSum(x, rpeInput + tensorIndex, rpeWeights + weightIndex, D, fp16); - // if (b == 0 && h == 0 && q == 0 && k == 0) { - // // if (lane == 0) { - // // sum += __shfl_down_sync(0xffffffff, sum, 1); - // // sum = __shfl_down_sync(0xffffffff, sum, 1); - // // return sum; - // // printf("\n"); - // for (int i = 0; i < 32; i++) { - // printf("x: %i, lane: %i, thread warp id: %d, sum: %f\n", x, lane, - // i, (float)__shfl_sync(0xffffffff, sum, i)); - // } - // // } - // printf("x: %d, sum: %f\n", x, (float)sum); - // } if (lane == 0) { int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; @@ -1645,6 +1632,67 @@ void multiplyRPEAttentionLogits(const T* rpeInput, const T* rpeWeights, ReportCUDAErrors(cudaGetLastError()); } +template +__global__ void rpeQK_multiply_kernel(const T* rpeInputQ, const T* rpeWeightsQ, + const T* rpeInputK, const T* rpeWeightsK, + const T* skipAdd, T* output, int B, int H, + int Q, int K, int D, float outScale) { + // Fused version of rpeVectorMultiply_kernel for RPE-Q and RPE-K. + const int x = threadIdx.x + blockDim.x * blockIdx.x; + const int q = threadIdx.y + blockDim.y * blockIdx.y; + const int bh = threadIdx.z + blockDim.z * blockIdx.z; + const int h = bh % H; + const int b = bh / H; + const bool fp16 = std::is_same::value; + const int lane = x & (kNumRpeKernelSplits - 1); + + // Read tensors per the input layouts and write out per the output layout. + // Sum is along the D dimension, and K is on the x-axis. 
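+  // Illustrative summary (added commentary): per output element the fused
+  // kernel evaluates
+  //   out[b,h,q,k] = (sum_d Q[b,q,h,d] * rpe_q[h,q,k,d]
+  //                   + sum_d K[b,k,h,d] * rpe_k[h,k,q,d]
+  //                   + attn[b,h,q,k]) * outScale
+  // i.e. the two separate RPE-Q / RPE-K passes collapsed into a single
+  // read-modify-write of the attention logits.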
+ int k = x / kNumRpeKernelSplits; + if (b >= B || h >= H || q >= Q || k >= K) return; + + // RPE-Q sum + const int tidxq = getTensorIndex(b, q, h, 0, B, Q, H, D); + const int widxq = getTensorIndex(h, q, k, 0, H, Q, K, D); + T sum1 = dotProductSum(x, rpeInputQ + tidxq, rpeWeightsQ + widxq, D, fp16); + + // RPE-K sum. + const int tidxk = getTensorIndex(b, k, h, 0, B, K, H, D); + const int widxk = getTensorIndex(h, k, q, 0, H, K, Q, D); + T sum2 = dotProductSum(x, rpeInputK + tidxk, rpeWeightsK + widxk, D, fp16); + + // Write out the result. + if (lane == 0) { + int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); + output[outIdx] = (T)(sum1 + sum2 + skipAdd[outIdx]) * (T)outScale; + } +} + +template +void multiplyRpeQKLogits(const T* rpeInputQ, const T* rpeWeightsQ, + const T* rpeInputK, const T* rpeWeightsK, + const T* attnInput, T* output, int B, int H, int Q, + int K, int D, float outScale, cudaStream_t stream) { + // rpeType: Q | K + // rpeInput: [B, Q, H, D] | [B, K, H, D] + // rpeWeights: [H, Q, K, D] | [H, K, Q, D] + // attnInput: [B, H, Q, K] | [B, H, Q, K] + dim3 blockDim, gridDim; + blockDim.x = std::min(32, K); + blockDim.y = std::min(16, Q); + blockDim.z = std::min(std::max(512 / (blockDim.x * blockDim.y), 1u), + (unsigned int)(B * H)); + gridDim.x = DivUp(K * kNumRpeKernelSplits, blockDim.x); + gridDim.y = DivUp(Q, blockDim.y); + gridDim.z = DivUp(B * H, blockDim.z); + + rpeQK_multiply_kernel<<>>( + rpeInputQ, rpeWeightsQ, rpeInputK, rpeWeightsK, attnInput, output, B, H, + Q, K, D, outScale); + + ReportCUDAErrors(cudaGetLastError()); +} + template __global__ void permuteTensor_kernel(T* output, const T* input, int s1, int s2, int s3, int s4, int p1, int p2, int p3, @@ -1990,6 +2038,16 @@ template void multiplyRPEAttentionLogits( float* output, int B, int H, int Q, int K, int D, float outScale, size_t rpetype, cudaStream_t stream); +template void multiplyRpeQKLogits( + const half* rpeInputQ, const half* rpeWeightsQ, const half* rpeInputK, + const half* rpeWeightsK, const half* attnInput, half* output, int B, int H, + int Q, int K, int D, float outScale, cudaStream_t stream); + +template void multiplyRpeQKLogits( + const float* rpeInputQ, const float* rpeWeightsQ, const float* rpeInputK, + const float* rpeWeightsK, const float* attnInput, float* output, int B, + int H, int Q, int K, int D, float outScale, cudaStream_t stream); + template void permuteTensor(half* output, const half* input, int s1, int s2, int s3, int s4, int p1, int p2, int p3, int p4, cudaStream_t stream); diff --git a/src/neural/cuda/kernels.h b/src/neural/cuda/kernels.h index 15571b6acf..966207f378 100644 --- a/src/neural/cuda/kernels.h +++ b/src/neural/cuda/kernels.h @@ -162,6 +162,13 @@ void multiplyRPEAttentionLogits(const T* rpeInput, const T* rpeWeights, int Q, int K, int D, float outScale, size_t rpetype, cudaStream_t stream); +template +void multiplyRpeQKLogits(const T* rpeInputQ, const T* rpeWeightsQ, + const T* rpeInputK, const T* rpeWeightsK, + const T* attnInput, T* output, int B, int H, int Q, + int K, int D, float outScale, + cudaStream_t stream); + template void permuteTensor(T* output, const T* input, int s1, int s2, int s3, int s4, int p1, int p2, int p3, int p4, cudaStream_t stream); diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc index c7f1add37f..c72fbe0892 100644 --- a/src/neural/cuda/layers.cc +++ b/src/neural/cuda/layers.cc @@ -1922,26 +1922,37 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, { // RPE Q and K. 
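+      // Note (illustrative, based only on the commit message's 10% figure):
+      // the fused path below presumably wins by touching the [B, H, 64, 64]
+      // logits tensor once instead of twice and by sharing the per-thread
+      // index math between the Q and K contractions.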
- if (mha_rpe_q_size_ > 0) { - // Matrix-vector multiplication for query x rpe_q - // Note: mha_q here is not yet transposed, so shape is still BQHD. - // mha_q @ rpe_q: [B, Q, H, D] x [D, H, Q, K] - // Kernel performs the required transpositions. - float outScale = mha_rpe_k_size_ == 0 ? factor : 1.0f; - multiplyRPEAttentionLogits(mha_q, mha_rpe_q, buffer1, buffer1, - N, encoder_heads_, 64, 64, depth, - outScale, 0, stream); - // dumpTensor((DataType*)buffer1, 64 * d_model * N, "from kernel", 8192); - // exit(0); - } - if (mha_rpe_k_size_ > 0) { - // Matrix-vector multiplication for key x rpe_k - // Note: mha_k here is not yet transposed, so shape is still BKHD. - // mha_k @ rpe_k: [B, K, H, D] x [D, H, Q, K] - // Kernel performs the required transpositions. - multiplyRPEAttentionLogits(mha_k, mha_rpe_k, buffer1, buffer1, - N, encoder_heads_, 64, 64, depth, - factor, 1, stream); + if (mha_rpe_q_size_ > 0 && mha_rpe_k_size_ > 0) { + // Matrix-vector multiplication for query x rpe_q + key x rpe_k + attn + // Note: mha_q and mha_k here are not yet transposed, so shape is still + // BQHD/BKHD. mha_q @ rpe_q: [B, Q, H, D] x [D, H, Q, K] mha_k @ rpe_k: + // [B, K, H, D] x [D, H, Q, K] Kernel performs the required + // transpositions. + multiplyRpeQKLogits(mha_q, mha_rpe_q, mha_k, mha_rpe_k, buffer1, + buffer1, N, encoder_heads_, 64, 64, depth, + factor, stream); + } else { + // RPE Q. + if (mha_rpe_q_size_ > 0) { + // Matrix-vector multiplication for query x rpe_q + // Note: mha_q here is not yet transposed, so shape is still BQHD. + // mha_q @ rpe_q: [B, Q, H, D] x [D, H, Q, K] + // Kernel performs the required transpositions. + float outScale = mha_rpe_k_size_ == 0 ? factor : 1.0f; + multiplyRPEAttentionLogits(mha_q, mha_rpe_q, buffer1, buffer1, + N, encoder_heads_, 64, 64, depth, + outScale, 0, stream); + } + // RPE K. + if (mha_rpe_k_size_ > 0) { + // Matrix-vector multiplication for key x rpe_k + // Note: mha_k here is not yet transposed, so shape is still BKHD. + // mha_k @ rpe_k: [B, K, H, D] x [D, H, Q, K] + // Kernel performs the required transpositions. + multiplyRPEAttentionLogits(mha_k, mha_rpe_k, buffer1, buffer1, + N, encoder_heads_, 64, 64, depth, + factor, 1, stream); + } } } // attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1) @@ -1983,8 +1994,6 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, multiplyRPEAttentionLogits(buffer1, mha_rpe_v, buffer2, buffer2, N, encoder_heads_, 64, 64, depth, 1.0f, 2, stream); - // dumpTensor((DataType*)buffer2, 4096 * d_model * N, "from kernel", 8192); - // exit(0); } // #final dense layer (mha_dense), buffer2 -> buffer1
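+    // Illustrative recap (added commentary, with assumptions about the
+    // surrounding buffers): when RPE-V weights are present, buffer2 now
+    // holds, in [B, Q, H, D] order, the attention output plus
+    //   buffer2[b,q,h,d] += sum_k attn[b,h,q,k] * rpe_v[h,q,d,k]
+    // before being treated as [B, 64, d_model] for the dense layer below.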