From 32881af0778ceef7bed14774cbb65405617428c7 Mon Sep 17 00:00:00 2001 From: almaudoh Date: Wed, 10 Jul 2024 03:38:10 +0200 Subject: [PATCH 1/7] Add relative positional encoding, RPE in cuda backend. --- libs/lczero-common | 2 +- src/neural/cuda/common_kernels.cu | 160 +++++++++++++-- src/neural/cuda/kernels.h | 7 + src/neural/cuda/layers.cc | 327 +++++++++++++++++++----------- src/neural/cuda/layers.h | 31 ++- src/neural/cuda/network_cuda.cc | 8 +- src/neural/network_legacy.cc | 5 +- src/neural/network_legacy.h | 3 + 8 files changed, 395 insertions(+), 148 deletions(-) diff --git a/libs/lczero-common b/libs/lczero-common index 55e1b382ef..e05fb7a505 160000 --- a/libs/lczero-common +++ b/libs/lczero-common @@ -1 +1 @@ -Subproject commit 55e1b382efadd57903e37f2a2e29caef3ea85799 +Subproject commit e05fb7a505554682acc8a197eb797c26b6db161d diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index 395bab8d84..9e058b309e 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -340,19 +340,21 @@ void addBias_NCHW(T* c, T* a, T* b, int N, int C, int H, int W, ReportCUDAErrors(cudaGetLastError()); } -template -__device__ dT readNCHW(const sT* input_tensor, int n, int c, int h, int w, - int Nin, int Cin, int H, int W) { - if (n >= Nin || c >= Cin) return 0; +template +__device__ dT readInputTensor(const sT* input_tensor, int i, int j, int k, + int l, int I, int J, int K, int L) { + // i is the outermost|slowest|most-significant index, while l is the + // innermost|fastest|least-significant index. + if (i >= I || j >= J || k >= K || l >= L) return 0; int index; - index = n; - index *= Cin; - index += c; - index *= H; - index += h; - index *= W; - index += w; + index = i; + index *= J; + index += j; + index *= K; + index += k; + index *= L; + index += l; return (dT)(input_tensor[index]); } @@ -376,7 +378,7 @@ __global__ void NCHWtoNHWC_kernel(dT* output_tensor, const sT* input_tensor, int n = index; output_tensor[tid] = - readNCHW(input_tensor, n, c, h, w, Nin, Cin, H, W); + readInputTensor(input_tensor, n, c, h, w, Nin, Cin, H, W); } template @@ -1242,7 +1244,8 @@ __global__ void preprocess_for_attention_body_kernel( if (c >= input_size) { // concatenate from position encoding array if (is_pe_dense_embedding) { - op = (T)(encoding[n * 64 * encoding_size + hw * encoding_size + (c - input_size)]); + op = (T)(encoding[n * 64 * encoding_size + hw * encoding_size + + (c - input_size)]); } else { op = (T)(encoding[64 * hw + (c - input_size)]); } @@ -1309,6 +1312,126 @@ void applyInputGating(T* output, const T* input, const T* mult, const T* add, ReportCUDAErrors(cudaGetLastError()); } +// Get the index corresponding to position (i,j,k,l) in a tensor +// with dimensions (I,J,K,L) where the innermost dimension is L. 
+__device__ __forceinline__ int getTensorIndex(int i, int j, int k, int l, int I, + int J, int K, int L) { + if (i >= I || j >= J || k >= K || l >= L) return -1; + + int index; + index = i; + index *= J; + index += j; + index *= K; + index += k; + index *= L; + index += l; + + return index; +} + +template +__global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, + const T* skipAdd, T* output, int B, + int H, int Q, int K, int D, + float outScale, size_t rpetype) { + int x = threadIdx.x + blockDim.x * blockIdx.x; + int q = threadIdx.y + blockDim.y * blockIdx.y; + int bh = threadIdx.z + blockDim.z * blockIdx.z; + int h = bh % H; + int b = bh / H; + + if (rpetype == 0) { + // RPE-Q + // rpeInput: [B, Q, H, D] -> transpose to [B, H, Q, (1, D)] + // rpeWeights: [D, H, Q, K] -> transpose to [1, H, Q, (D, K)] + // output: [B, H, Q, K] + + // Read tensors per the input layouts and write out per the output layout. + // Sum is along the D dimension, K is along the x-axis. + int k = x; + if (b >= B || h >= H || q >= Q || k >= K) return; + + T sum = 0; + for (int i = 0; i < D; i++) { + sum += readInputTensor(rpeInput, b, q, h, i, B, Q, H, D) * + readInputTensor(rpeWeights, i, h, q, k, D, H, Q, K); + } + int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); + output[outIdx] = (sum + skipAdd[outIdx]) * (T)outScale; + } else if (rpetype == 1) { + // RPE-K + // rpeInput: [B, K, H, D] -> transpose to [B, H, K, (1, D)] + // rpeWeights: [D, H, Q, K] -> transpose to [1, H, K, (D, Q)] + // output: [B, H, Q, K] + + // Read tensors per the input layouts and write out per the output layout. + // Sum is along the D dimension, and K is along the x-axis. + int k = x; + if (b >= B || h >= H || q >= Q || k >= K) return; + + T sum = 0; + for (int i = 0; i < D; i++) { + sum += readInputTensor(rpeInput, b, k, h, i, B, K, H, D) * + readInputTensor(rpeWeights, i, h, q, k, D, H, Q, K); + } + int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); + output[outIdx] = (sum + skipAdd[outIdx]) * (T)outScale; + } else if (rpetype == 2) { + // RPE-V + // rpeInput: [B, H, Q, K] -> transpose to [B, H, Q, (1, K)] + // rpeWeights: [D, H, Q, K] -> transpose to [1, H, Q, (K, D)] + // output: [B, Q, H, D] + // The skip connection is also already in BQHD order. + + // Read tensors per the input layouts and write out per the output layout. + // Sum is along the K dimension, and D is along the x-axis. + int d = x; + if (b >= B || h >= H || q >= Q || d >= D) return; + + T sum = 0; + for (int i = 0; i < K; i++) { + sum += readInputTensor(rpeInput, b, h, q, i, B, H, Q, K) * + readInputTensor(rpeWeights, d, h, q, i, D, H, Q, K); + } + int outIdx = getTensorIndex(b, q, h, d, B, Q, H, D); + output[outIdx] = (sum + skipAdd[outIdx]) * (T)outScale; + } +} + +template +void multiplyRPEAttentionLogits(const T* rpeInput, const T* rpeWeights, + const T* attnInput, T* output, int B, int H, + int Q, int K, int D, float outScale, + size_t rpetype, cudaStream_t stream) { + if (rpetype > 2) { + throw Exception("unsupported rpetype in multiplyRPEAttentionLogits."); + } + + // rpeType: Q | K | V + // rpeInput: [B, Q, H, D] | [B, K, H, D] | [B, H, Q, K] + // rpeWeights: [D, H, Q, K] | [D, H, Q, K] | [D, H, Q, K] + + // 3D block structure where x-axis maps to K (or D for rpetype==2), y-axis to + // Q and z-axis to BH. Each thread calculates the vector sum-product for the + // cube at BHQK | BHQD (i.e. "d,dk->k" | "d,dq->q" | "k,kd->d"). + dim3 blockDim, gridDim; + int X = rpetype == 2 ? 
D : K;
+  blockDim.x = std::min(32, X);
+  blockDim.y = std::min(32, Q);
+  blockDim.z = std::min(std::max(1024 / (blockDim.x * blockDim.y), 1u),
+                        (unsigned int)(B * H));
+  gridDim.x = DivUp(X, blockDim.x);
+  gridDim.y = DivUp(Q, blockDim.y);
+  gridDim.z = DivUp(B * H, blockDim.z);
+
+  rpeVectorMultiply_kernel<<<gridDim, blockDim, 0, stream>>>(
+      rpeInput, rpeWeights, attnInput, output, B, H, Q, K, D, outScale,
+      rpetype);
+
+  ReportCUDAErrors(cudaGetLastError());
+}
+
 // Template instantiation.
 template void copyTypeConverted(half* op, float* ip, int N,
                                 cudaStream_t stream);
@@ -1595,5 +1718,16 @@ template void applyInputGating(float* output, const float* input,
                                       const float* mult, const float* add,
                                       int N, int C, int output_size,
                                       cudaStream_t stream);
+
+template void multiplyRPEAttentionLogits(
+    const half* rpeInput, const half* rpeWeights, const half* attnInput,
+    half* output, int B, int H, int Q, int K, int D, float outScale,
+    size_t rpetype, cudaStream_t stream);
+
+template void multiplyRPEAttentionLogits(
+    const float* rpeInput, const float* rpeWeights, const float* attnInput,
+    float* output, int B, int H, int Q, int K, int D, float outScale,
+    size_t rpetype, cudaStream_t stream);
+
 }  // namespace cudnn_backend
 }  // namespace lczero
diff --git a/src/neural/cuda/kernels.h b/src/neural/cuda/kernels.h
index 483af0f4db..31db577808 100644
--- a/src/neural/cuda/kernels.h
+++ b/src/neural/cuda/kernels.h
@@ -155,5 +155,12 @@ void inputPreprocessForAttentionBody(T* output, const T* input,
 template <typename T>
 void applyInputGating(T* output, const T* input, const T* mult, const T* add,
                       int N, int HW, int C, cudaStream_t stream);
+
+template <typename T>
+void multiplyRPEAttentionLogits(const T* rpeInput, const T* rpeWeights,
+                                const T* attnInput, T* output, int B, int H,
+                                int Q, int K, int D, float outScale,
+                                size_t rpetype, cudaStream_t stream);
+
 }  // namespace cudnn_backend
 }  // namespace lczero
diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc
index 8543443897..7d0667859b 100644
--- a/src/neural/cuda/layers.cc
+++ b/src/neural/cuda/layers.cc
@@ -42,56 +42,55 @@ namespace lczero {
 #if 0
 // debug code to dump allocation in GPU memory
 template <typename T>
-void dumpTensor(T* memory, int elements, const char* message, bool only_summary = false) {
-  const bool fp16 = std::is_same<half, T>::value;
-  printf("\n%s\n", message);
-  int elementSize = (int) (fp16 ? sizeof(half) : sizeof(float));
-  int bytes = elements * elementSize;
-  void *temp = malloc(bytes);
-  cudaMemcpy(temp, memory, bytes, cudaMemcpyDeviceToHost);
-  float maxval = -std::numeric_limits<float>::max();
-  float minval = std::numeric_limits<float>::max();
-  int nans = 0;
-  int nanss[10] {};
-
-  for (int i = 0; i < elements; i++)
-  {
-    float val;
-    if (fp16)
-    {
-      half *arr = (half*)temp;
-      val = (float)arr[i];
-    }
-    else
-    {
-      float *arr = (float *)temp;
-      val = arr[i];
-    }
-    maxval = std::max(maxval, val);
-    minval = std::min(minval, val);
+void dumpTensor(T* memory, int elements, const char* message, int lines = -1,
+                int start = 0) {
+  const bool fp16 = std::is_same<half, T>::value;
+  printf("\n%s\n", message);
+  int elementSize = (int)(fp16 ? sizeof(half) : sizeof(float));
+  int bytes = elements * elementSize;
+  void* temp = malloc(bytes);
+  cudaMemcpy(temp, memory, bytes, cudaMemcpyDeviceToHost);
+  float maxval = -std::numeric_limits<float>::max();
+  float minval = std::numeric_limits<float>::max();
+  int nans = 0;
+  int nanss[10]{};
+
+  for (int i = 0; i < elements; i++) {
+    float val;
+    if (fp16) {
+      half* arr = (half*)temp;
+      val = (float)arr[i];
+    } else {
+      float* arr = (float*)temp;
+      val = arr[i];
+    }
+    maxval = std::max(maxval, val);
+    minval = std::min(minval, val);
 
-    if (std::isnan(val)) {
-      if (nans < 10) nanss[nans] = i;
-      nans++;
-    }
+    if (std::isnan(val)) {
+      if (nans < 10) nanss[nans] = i;
+      nans++;
+    }
 
-    if (!only_summary || i < 2 || i == elements - 1) {
-      // printf("%8.4f ", val);
-      // if ((i % 8) == 7) printf("\n");
-      printf("%i;%.6f\n", i, val);
-    }
+    if ((i >= start && (i < start + lines || lines == -1)) ||
+        i == elements - 1) {
+      // printf("%8.4f ", val);
+      // if ((i % 8) == 7) printf("\n");
+      printf("%i;%.6f\n", i, val);
     }
-    free(temp);
-    if (maxval == -std::numeric_limits<float>::max())
-      maxval = std::numeric_limits<float>::quiet_NaN();
-    if (minval == std::numeric_limits<float>::max())
-      minval = std::numeric_limits<float>::quiet_NaN();
-
-    printf("Max: %.6f, Min: %.6f, NaNs: %i of %i", maxval, minval, nans, elements);
-    printf("\nNaN indices: ");
-    for (int i=0; i<nans && i<10; i++) printf("%i ", nanss[i]);
-    if (nans > 10) printf("......");
-    printf("\n");
+  }
+  free(temp);
+  if (maxval == -std::numeric_limits<float>::max())
+    maxval = std::numeric_limits<float>::quiet_NaN();
+  if (minval == std::numeric_limits<float>::max())
+    minval = std::numeric_limits<float>::quiet_NaN();
+
+  printf("Max: %.6f, Min: %.6f, NaNs: %i of %i", maxval, minval, nans,
+         elements);
+  printf("\nNaN indices: ");
+  for (int i = 0; i < nans && i < 10; i++) printf("%i ", nanss[i]);
+  if (nans > 10) printf("......");
+  printf("\n");
 }
 #endif
@@ -1419,6 +1418,68 @@ void allocAndUpload(DataType** gpu_dest, std::vector cpu_src,
                     (int)cpu_src.size(), 0);
 }
 
+template <typename DataType>
+static void cublasXgemm(cublasHandle_t handle, cublasOperation_t transa,
+                        cublasOperation_t transb, int m, int n, int k,
+                        float alpha, const DataType* A, int lda,
+                        const DataType* B, int ldb, float beta, DataType* C,
+                        int ldc) {
+  const bool fp16 = std::is_same<half, DataType>::value;
+  if (fp16) {
+    unsigned short alpha_h = FP32toFP16(alpha);
+    unsigned short beta_h = FP32toFP16(beta);
+    ReportCUBLASErrors(cublasHgemm(
+        handle, transa, transb, m, n, k, (const half*)&alpha_h, (const half*)A,
+        lda, (const half*)B, ldb, (const half*)&beta_h, (half*)C, ldc));
+  } else {
+    ReportCUBLASErrors(cublasSgemm(handle, transa, transb, m, n, k, &alpha,
+                                   (const float*)A, lda, (const float*)B, ldb,
+                                   &beta, (float*)C, ldc));
+  }
+}
+
+template <typename DataType>
+static void cublasXGemmStridedBatched(
+    cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+    int m, int n, int k, float alpha, const void* A, int lda,
+    long long int strideA, const void* B, int ldb, long long int strideB,
+    float beta, void* C, int ldc, long long int strideC, int batchCount) {
+  const bool fp16 = std::is_same<half, DataType>::value;
+  if (fp16) {
+    unsigned short alpha_h = FP32toFP16(alpha);
+    unsigned short beta_h = FP32toFP16(beta);
+    ReportCUBLASErrors(cublasGemmStridedBatchedEx(
+        handle, transa, transb, m, n, k, &alpha_h, A, CUDA_R_16F, lda, strideA,
+        B, CUDA_R_16F, ldb, strideB, &beta_h, C, CUDA_R_16F, ldc, strideC,
+        batchCount, CUDA_R_16F, CUBLAS_GEMM_DEFAULT));
+  } else {
+    ReportCUBLASErrors(cublasGemmStridedBatchedEx(
+        handle, transa, transb, m, n, k, &alpha, A, CUDA_R_32F, lda, strideA, B,
+        CUDA_R_32F, ldb, strideB, &beta, C,
CUDA_R_32F, ldc, strideC, + batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT)); + } +} + +template +static void cublasXGemmBatched(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + float alpha, DataType** A, int lda, DataType** B, + int ldb, float beta, DataType** C, int ldc, + int batchCount) { + const bool fp16 = std::is_same::value; + if (fp16) { + unsigned short alpha_h = FP32toFP16(alpha); + unsigned short beta_h = FP32toFP16(beta); + ReportCUBLASErrors(cublasHgemmBatched( + handle, transa, transb, m, n, k, (const half*)&alpha_h, (half**)A, lda, + (half**)B, ldb, (const half*)&beta_h, (half**)C, ldc, batchCount)); + } else { + ReportCUBLASErrors(cublasSgemmBatched( + handle, transa, transb, m, n, k, &alpha, (float**)A, lda, (float**)B, + ldb, &beta, (float**)C, ldc, batchCount)); + } +} + template AttentionPolicyHead::AttentionPolicyHead( BaseLayer* ip, const MultiHeadWeights::PolicyHead& weights, @@ -1579,67 +1640,73 @@ EncoderBlock::EncoderBlock( // GPU memory already allocated in AttentionBody. smol_global = smolgen_global_scratch; } -} -template -static void cublasXgemm(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - float alpha, const DataType* A, int lda, - const DataType* B, int ldb, float beta, DataType* C, - int ldc) { - const bool fp16 = std::is_same::value; - if (fp16) { - unsigned short alpha_h = FP32toFP16(alpha); - unsigned short beta_h = FP32toFP16(beta); - ReportCUBLASErrors(cublasHgemm( - handle, transa, transb, m, n, k, (const half*)&alpha_h, (const half*)A, - lda, (const half*)B, ldb, (const half*)&beta_h, (half*)C, ldc)); - } else { - ReportCUBLASErrors(cublasSgemm(handle, transa, transb, m, n, k, &alpha, - (const float*)A, lda, (const float*)B, ldb, - &beta, (float*)C, ldc)); - } -} + // RPE weights. + if (cpu_weights.mha.rpe_q.size() > 0 || cpu_weights.mha.rpe_k.size() > 0 || + cpu_weights.mha.rpe_v.size() > 0) { + // Weights factorizer. + int rows = 15 * 15; + int cols = 64 * 64; + int row, col; + std::vector rpe_map(rows * cols); + // 15 * 15 in units for distance pairs to 64 * 64 pairs of squares. + // Distance pairs mapped on rows, while square pairs mapped on columns. 
+ for (auto i = 0; i < 8; i++) { + for (auto j = 0; j < 8; j++) { + for (auto k = 0; k < 8; k++) { + for (auto l = 0; l < 8; l++) { + row = 15 * (i - k + 7) + (j - l + 7); + col = 64 * (i * 8 + j) + k * 8 + l; + rpe_map[row * cols + col] = 1.0f; + } + } + } + } + allocAndUpload(&mha_rpe_factorizer, rpe_map, scratch); -template -static void cublasXGemmStridedBatched( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, float alpha, const void* A, int lda, - long long int strideA, const void* B, int ldb, long long int strideB, - float beta, void* C, int ldc, long long int strideC, int batchCount) { - const bool fp16 = std::is_same::value; - if (fp16) { - unsigned short alpha_h = FP32toFP16(alpha); - unsigned short beta_h = FP32toFP16(beta); - ReportCUBLASErrors(cublasGemmStridedBatchedEx( - handle, transa, transb, m, n, k, &alpha_h, A, CUDA_R_16F, lda, strideA, - B, CUDA_R_16F, ldb, strideB, &beta_h, C, CUDA_R_16F, ldc, strideC, - batchCount, CUDA_R_16F, CUBLAS_GEMM_DEFAULT)); - } else { - ReportCUBLASErrors(cublasGemmStridedBatchedEx( - handle, transa, transb, m, n, k, &alpha, A, CUDA_R_32F, lda, strideA, B, - CUDA_R_32F, ldb, strideB, &beta, C, CUDA_R_32F, ldc, strideC, - batchCount, CUDA_R_32F, CUBLAS_GEMM_DEFAULT)); - } -} + // We need a cublas instance for the gemm. Create a temporary one. + cublasHandle_t tmp_cublas; + ReportCUBLASErrors(cublasCreate(&tmp_cublas)); -template -static void cublasXGemmBatched(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - float alpha, DataType** A, int lda, DataType** B, - int ldb, float beta, DataType** C, int ldc, - int batchCount) { - const bool fp16 = std::is_same::value; - if (fp16) { - unsigned short alpha_h = FP32toFP16(alpha); - unsigned short beta_h = FP32toFP16(beta); - ReportCUBLASErrors(cublasHgemmBatched( - handle, transa, transb, m, n, k, (const half*)&alpha_h, (half**)A, lda, - (half**)B, ldb, (const half*)&beta_h, (half**)C, ldc, batchCount)); - } else { - ReportCUBLASErrors(cublasSgemmBatched( - handle, transa, transb, m, n, k, &alpha, (float**)A, lda, (float**)B, - ldb, &beta, (float**)C, ldc, batchCount)); + // Allocate RPE weights and multiply by factorizer + DataType* rpe_scratch; + mha_rpe_q_size_ = cpu_weights.mha.rpe_q.size(); + if (mha_rpe_q_size_ > 0) { + allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_q, scratch); + + // Gemm to factorize the RPE Q weights. + ReportCUDAErrors( + cudaMalloc(&mha_rpe_q, mha_q_size_ * 4096 * sizeof(DataType))); + cublasXgemm(tmp_cublas, CUBLAS_OP_N, CUBLAS_OP_T, 4096, + mha_q_size_, 225, 1.0f, mha_rpe_factorizer, 4096, + rpe_scratch, mha_q_size_, 0.0f, + (DataType*)mha_rpe_q, 4096); + } + mha_rpe_k_size_ = cpu_weights.mha.rpe_k.size(); + if (mha_rpe_k_size_ > 0) { + allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_k, scratch); + + // Gemm to factorize the RPE K weights. + ReportCUDAErrors( + cudaMalloc(&mha_rpe_k, mha_k_size_ * 4096 * sizeof(DataType))); + cublasXgemm(tmp_cublas, CUBLAS_OP_N, CUBLAS_OP_T, 4096, + mha_k_size_, 225, 1.0f, mha_rpe_factorizer, 4096, + rpe_scratch, mha_k_size_, 0.0f, + (DataType*)mha_rpe_k, 4096); + } + mha_rpe_v_size_ = cpu_weights.mha.rpe_v.size(); + if (mha_rpe_v_size_ > 0) { + allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_v, scratch); + + // Gemm to factorize the RPE V weights. 
+ ReportCUDAErrors( + cudaMalloc(&mha_rpe_v, mha_v_size_ * 4096 * sizeof(DataType))); + cublasXgemm(tmp_cublas, CUBLAS_OP_N, CUBLAS_OP_T, 4096, + mha_v_size_, 225, 1.0f, mha_rpe_factorizer, 4096, + rpe_scratch, mha_v_size_, 0.0f, + (DataType*)mha_rpe_v, 4096); + } + cublasDestroy(tmp_cublas); } } @@ -1749,18 +1816,7 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, // (Maybe not, we can play with strides of the gemm and do independent gemms // for each encoder head) - // Apply scaled dot product attention: - /* - matmul_qk = tf.matmul(q, k, transpose_b=True) - dk = tf.cast(tf.shape(k)[-1], self.model_dtype) - scaled_attention_logits = matmul_qk / tf.math.sqrt(dk) - attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) - output = tf.matmul(attention_weights, v) - */ - - // shape(k)[-1] = depth - float factor = 1.0f / sqrt((float)depth); - + // Apply dot product attention // matmul_qk = tf.matmul(q, k, transpose_b=True) { if (*offset_pointers == nullptr) { @@ -1790,7 +1846,7 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, cublas, CUBLAS_OP_T, CUBLAS_OP_N, 64 /*M*/, 64 /*N*/, depth /*K*/, // A/B, and M/N are swapped for row-major to col-major // transform - factor, // to handle "/ tf.math.sqrt(dk)" + 1.0, // Scaling done after RPE logits *offset_pointers, // mha_k + offset /*A*/, d_model /*LDA*/, // (d_model = depth * encoder_heads_) to skip over // other "depth" slices / heads @@ -1808,6 +1864,30 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, N * encoder_heads_); } + // Scaling of attention logits is done after RPE. + float factor = 1.0f / sqrt((float)depth); + { + // RPE Q and K. + if (mha_rpe_q_size_ > 0) { + // Matrix-vector multiplication for query x rpe_q + // Note: mha_q here is not yet transposed, so shape is still BQHD. + // mha_q @ rpe_q: [B, Q, H, D] x [D, H, Q, K] + // Kernel performs the required transpositions. + float outScale = mha_rpe_k_size_ == 0 ? factor : 1.0f; + multiplyRPEAttentionLogits(mha_q, mha_rpe_q, buffer1, buffer1, + N, encoder_heads_, 64, 64, depth, + outScale, 0, stream); + } + if (mha_rpe_k_size_ > 0) { + // Matrix-vector multiplication for key x rpe_k + // Note: mha_k here is not yet transposed, so shape is still BKHD. + // mha_k @ rpe_k: [B, K, H, D] x [D, H, Q, K] + // Kernel performs the required transpositions. + multiplyRPEAttentionLogits(mha_k, mha_rpe_k, buffer1, buffer1, + N, encoder_heads_, 64, 64, depth, + factor, 1, stream); + } + } // attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1) // attention_weights -> buffer1 if (has_smolgen_) { @@ -1838,6 +1918,17 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, N * encoder_heads_); } + if (mha_rpe_v_size_ > 0) { + // RPE-V + // Matrix-vector multiplication for attn x rpe_v + // attn @ rpe_v: [B, H, Q, K] x [D, H, Q, K] + // Kernel performs the required transpositions. + // The matmul result (buffer2) is already with BQHD shape. 
+ multiplyRPEAttentionLogits(buffer1, mha_rpe_v, buffer2, buffer2, + N, encoder_heads_, 64, 64, depth, 1.0f, + 2, stream); + } + // #final dense layer (mha_dense), buffer2 -> buffer1 { const int num_inputs = d_model; diff --git a/src/neural/cuda/layers.h b/src/neural/cuda/layers.h index de563f9346..8eb2172181 100644 --- a/src/neural/cuda/layers.h +++ b/src/neural/cuda/layers.h @@ -353,6 +353,8 @@ class EncoderBlock { DataType *mha_v_w, *mha_v_b; DataType *mha_qkv_w, *mha_qkv_b; DataType *mha_dense_w, *mha_dense_b; + DataType *mha_rpe_q, *mha_rpe_k, *mha_rpe_v; + DataType *mha_rpe_factorizer; DataType *ln1_gammas, *ln1_betas; @@ -372,6 +374,9 @@ class EncoderBlock { int mha_k_size_; int mha_v_size_; int mha_dense_size_; + int mha_rpe_q_size_; + int mha_rpe_k_size_; + int mha_rpe_v_size_; int ffn_dense1_size_; int ffn_dense2_size_; @@ -379,7 +384,7 @@ class EncoderBlock { int embedding_op_size_; int encoder_heads_; - float alpha_; // scale to apply to skip connection add + float alpha_; // scale to apply to skip connection add float default_eps_; // value of epsilon where it wasn't specified in training const bool has_smolgen_; @@ -485,13 +490,18 @@ class AttentionBody : public BaseLayer { private: // GPU allocations to hold various weights used by the attention net body. - DataType *ip_emb_pre_w_, *ip_emb_pre_b_; // input position preprocessing weights. - DataType *ip_emb_w_, *ip_emb_b_; // "embedding" layer in net body - DataType *ip_emb_ln_g_, *ip_emb_ln_b_; // input embedding layernorm gamma and beta - DataType *ip_mult_gate_, *ip_add_gate_; // input gating - DataType *ip_emb_ffn_d1_w_, *ip_emb_ffn_d1_b_; // input embedding FFN dense1 weights - DataType *ip_emb_ffn_d2_w_, *ip_emb_ffn_d2_b_; // input embedding FFN dense2 weights - DataType *ip_emb_ffn_ln_g_, *ip_emb_ffn_ln_b_; // input embedding FFN layernorm gamma and beta + DataType *ip_emb_pre_w_, + *ip_emb_pre_b_; // input position preprocessing weights. 
+ DataType *ip_emb_w_, *ip_emb_b_; // "embedding" layer in net body + DataType *ip_emb_ln_g_, + *ip_emb_ln_b_; // input embedding layernorm gamma and beta + DataType *ip_mult_gate_, *ip_add_gate_; // input gating + DataType *ip_emb_ffn_d1_w_, + *ip_emb_ffn_d1_b_; // input embedding FFN dense1 weights + DataType *ip_emb_ffn_d2_w_, + *ip_emb_ffn_d2_b_; // input embedding FFN dense2 weights + DataType *ip_emb_ffn_ln_g_, + *ip_emb_ffn_ln_b_; // input embedding FFN layernorm gamma and beta DataType *smolgen_global_; // global smolgen weights for all encoder layers DataType *pos_encoding_; int embedding_dense_size_; @@ -523,8 +533,8 @@ class ValueHead : public BaseLayer { public: ValueHead(BaseLayer* ip, const MultiHeadWeights::ValueHead& weights, - void* scratch, bool attention_body, bool wdl, ActivationFunction act, - int max_batch_size, bool use_gemm_ex); + void* scratch, bool attention_body, bool wdl, + ActivationFunction act, int max_batch_size, bool use_gemm_ex); ~ValueHead(); void Eval(int N, DataType* output, const DataType* input, const DataType* input2, void* scratch, size_t scratch_size, @@ -548,6 +558,5 @@ class ValueHead : public BaseLayer { ActivationFunction act_; }; - } // namespace cudnn_backend } // namespace lczero diff --git a/src/neural/cuda/network_cuda.cc b/src/neural/cuda/network_cuda.cc index cf67d1336c..420b92d1fb 100644 --- a/src/neural/cuda/network_cuda.cc +++ b/src/neural/cuda/network_cuda.cc @@ -121,8 +121,8 @@ static size_t getMaxAttentionBodySize(const MultiHeadWeights& weights, int N) { template class CudaNetworkComputation : public NetworkComputation { public: - CudaNetworkComputation(CudaNetwork* network, - bool wdl, bool moves_left); + CudaNetworkComputation(CudaNetwork* network, bool wdl, + bool moves_left); ~CudaNetworkComputation(); void AddInput(InputPlanes&& input) override { @@ -530,8 +530,8 @@ class CudaNetwork : public Network { pblczero::NetworkFormat::VALUE_WDL; BaseLayer* lastlayer = attn_body_ ? encoder_last_ : resi_last_; auto value_main = std::make_unique>( - lastlayer, head, scratch_mem_, attn_body_, wdl_, act, - max_batch_size_, use_gemm_ex); + lastlayer, head, scratch_mem_, attn_body_, wdl_, act, max_batch_size_, + use_gemm_ex); network_.emplace_back(std::move(value_main)); } diff --git a/src/neural/network_legacy.cc b/src/neural/network_legacy.cc index 53846353c6..44f79c4517 100644 --- a/src/neural/network_legacy.cc +++ b/src/neural/network_legacy.cc @@ -142,7 +142,10 @@ BaseWeights::MHA::MHA(const pblczero::Weights::MHA& mha) dense_w(LayerAdapter(mha.dense_w()).as_vector()), dense_b(LayerAdapter(mha.dense_b()).as_vector()), smolgen(Smolgen(mha.smolgen())), - has_smolgen(mha.has_smolgen()) {} + has_smolgen(mha.has_smolgen()), + rpe_q(LayerAdapter(mha.rpe_q()).as_vector()), + rpe_k(LayerAdapter(mha.rpe_k()).as_vector()), + rpe_v(LayerAdapter(mha.rpe_v()).as_vector()) {} BaseWeights::FFN::FFN(const pblczero::Weights::FFN& ffn) : dense1_w(LayerAdapter(ffn.dense1_w()).as_vector()), diff --git a/src/neural/network_legacy.h b/src/neural/network_legacy.h index 72ce67544f..7ad5db82ba 100644 --- a/src/neural/network_legacy.h +++ b/src/neural/network_legacy.h @@ -81,6 +81,9 @@ struct BaseWeights { Vec dense_b; Smolgen smolgen; bool has_smolgen; + Vec rpe_q; + Vec rpe_k; + Vec rpe_v; }; struct FFN { From 9614d656c5a051e04a5a8fa701de6ac1ca02cfe5 Mon Sep 17 00:00:00 2001 From: almaudoh Date: Wed, 10 Jul 2024 04:11:17 +0200 Subject: [PATCH 2/7] Fix attention scaling factor for non-rpe nets. 
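
Networks without RPE were left with no attention scaling at all after the
previous commit, because the 1/sqrt(d_k) factor was only re-applied through
the RPE kernels' outScale. Apply the factor as the Q*K^T GEMM alpha again
when no rpe_q/rpe_k weights are present, and keep applying it after the RPE
logits otherwise. Rough sketch (illustration only; has_rpe_q / has_rpe_k
stand in for the mha_rpe_q_size_ > 0 / mha_rpe_k_size_ > 0 checks in the
diff below):

    float factor = 1.0f / sqrt((float)depth);
    float gemm_alpha = (has_rpe_q || has_rpe_k) ? 1.0f : factor;
    // RPE path: multiplyRPEAttentionLogits() applies factor as its
    // outScale on the last of the RPE-Q / RPE-K multiplications instead.
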
--- src/neural/cuda/common_kernels.cu | 6 +++--- src/neural/cuda/layers.cc | 21 ++++++++++++++++----- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index 9e058b309e..d7c61197df 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -1348,7 +1348,7 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, // output: [B, H, Q, K] // Read tensors per the input layouts and write out per the output layout. - // Sum is along the D dimension, K is along the x-axis. + // Sum is along the D dimension, and K is on the x-axis. int k = x; if (b >= B || h >= H || q >= Q || k >= K) return; @@ -1366,7 +1366,7 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, // output: [B, H, Q, K] // Read tensors per the input layouts and write out per the output layout. - // Sum is along the D dimension, and K is along the x-axis. + // Sum is along the D dimension, and K is on the x-axis. int k = x; if (b >= B || h >= H || q >= Q || k >= K) return; @@ -1385,7 +1385,7 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, // The skip connection is also already in BQHD order. // Read tensors per the input layouts and write out per the output layout. - // Sum is along the K dimension, and D is along the x-axis. + // Sum is along the K dimension, and D is on the x-axis. int d = x; if (b >= B || h >= H || q >= Q || d >= D) return; diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc index 7d0667859b..a7a930d29b 100644 --- a/src/neural/cuda/layers.cc +++ b/src/neural/cuda/layers.cc @@ -1641,7 +1641,7 @@ EncoderBlock::EncoderBlock( smol_global = smolgen_global_scratch; } - // RPE weights. + // RPE weights. /* if (cpu_weights.mha.rpe_q.size() > 0 || cpu_weights.mha.rpe_k.size() > 0 || cpu_weights.mha.rpe_v.size() > 0) { // Weights factorizer. @@ -1816,7 +1816,18 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, // (Maybe not, we can play with strides of the gemm and do independent gemms // for each encoder head) - // Apply dot product attention + // Apply scaled dot product attention: + /* + matmul_qk = tf.matmul(q, k, transpose_b=True) + dk = tf.cast(tf.shape(k)[-1], self.model_dtype) + scaled_attention_logits = matmul_qk / tf.math.sqrt(dk) + attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) + output = tf.matmul(attention_weights, v) + */ + + // shape(k)[-1] = depth + float factor = 1.0f / sqrt((float)depth); + // matmul_qk = tf.matmul(q, k, transpose_b=True) { if (*offset_pointers == nullptr) { @@ -1846,7 +1857,9 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, cublas, CUBLAS_OP_T, CUBLAS_OP_N, 64 /*M*/, 64 /*N*/, depth /*K*/, // A/B, and M/N are swapped for row-major to col-major // transform - 1.0, // Scaling done after RPE logits + mha_rpe_q_size_ > 0 || mha_rpe_k_size_ > 0 + ? 1.0f + : factor, // in RPE nets, scaling is done after RPE logits *offset_pointers, // mha_k + offset /*A*/, d_model /*LDA*/, // (d_model = depth * encoder_heads_) to skip over // other "depth" slices / heads @@ -1864,8 +1877,6 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, N * encoder_heads_); } - // Scaling of attention logits is done after RPE. - float factor = 1.0f / sqrt((float)depth); { // RPE Q and K. 
if (mha_rpe_q_size_ > 0) { From 2ee1637780a55891c4c268ce443f3730f8851b3d Mon Sep 17 00:00:00 2001 From: almaudoh Date: Mon, 15 Jul 2024 02:07:20 +0200 Subject: [PATCH 3/7] Minor optimization of kernel. Not much perf diff. --- src/neural/cuda/common_kernels.cu | 106 ++++++++++++++++++++---------- 1 file changed, 71 insertions(+), 35 deletions(-) diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index d7c61197df..5fdb5a728a 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -340,21 +340,19 @@ void addBias_NCHW(T* c, T* a, T* b, int N, int C, int H, int W, ReportCUDAErrors(cudaGetLastError()); } -template -__device__ dT readInputTensor(const sT* input_tensor, int i, int j, int k, - int l, int I, int J, int K, int L) { - // i is the outermost|slowest|most-significant index, while l is the - // innermost|fastest|least-significant index. - if (i >= I || j >= J || k >= K || l >= L) return 0; +template +__device__ dT readNCHW(const sT* input_tensor, int n, int c, int h, int w, + int Nin, int Cin, int H, int W) { + if (n >= Nin || c >= Cin) return 0; int index; - index = i; - index *= J; - index += j; - index *= K; - index += k; - index *= L; - index += l; + index = n; + index *= Cin; + index += c; + index *= H; + index += h; + index *= W; + index += w; return (dT)(input_tensor[index]); } @@ -378,7 +376,7 @@ __global__ void NCHWtoNHWC_kernel(dT* output_tensor, const sT* input_tensor, int n = index; output_tensor[tid] = - readInputTensor(input_tensor, n, c, h, w, Nin, Cin, H, W); + readNCHW(input_tensor, n, c, h, w, Nin, Cin, H, W); } template @@ -1318,16 +1316,40 @@ __device__ __forceinline__ int getTensorIndex(int i, int j, int k, int l, int I, int J, int K, int L) { if (i >= I || j >= J || k >= K || l >= L) return -1; - int index; - index = i; - index *= J; - index += j; - index *= K; - index += k; - index *= L; - index += l; - - return index; + return (((((i * J) + j) * K) + k) * L) + l; + + // int index; + // index = i; + // index *= J; + // index += j; + // index *= K; + // index += k; + // index *= L; + // index += l; + + + // return index; +} + +template +__device__ __forceinline__ dT readInputTensor(const sT* input_tensor, size_t i, size_t j, size_t k, + size_t l, size_t I, size_t J, size_t K, size_t L) { + // i is the outermost|slowest|most-significant index, while l is the + // innermost|fastest|least-significant index. 
+ if (i >= I || j >= J || k >= K || l >= L) return 0; + + // int index; + // index = i; + // index *= J; + // index += j; + // index *= K; + // index += k; + // index *= L; + // index += l; + // int index = getTensorIndex(i, j, k, l, I, J, K, L); + size_t index = (((((i * J) + j) * K) + k) * L) + l; + + return (dT)(input_tensor[index]); } template @@ -1353,12 +1375,17 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, if (b >= B || h >= H || q >= Q || k >= K) return; T sum = 0; - for (int i = 0; i < D; i++) { - sum += readInputTensor(rpeInput, b, q, h, i, B, Q, H, D) * - readInputTensor(rpeWeights, i, h, q, k, D, H, Q, K); + const T* ip = rpeInput + getTensorIndex(b, q, h, 0, B, Q, H, D); + const T* wt = rpeWeights + getTensorIndex(0, h, q, k, D, H, Q, K); + const int wtstep = H * Q * K; + + for (int i = 0, j = 0; i < D; i++, j += wtstep) { + sum += ip[i] * wt[j]; + // sum += readInputTensor(rpeInput, b, q, h, i, B, Q, H, D) * + // readInputTensor(rpeWeights, i, h, q, k, D, H, Q, K); } int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); - output[outIdx] = (sum + skipAdd[outIdx]) * (T)outScale; + output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; } else if (rpetype == 1) { // RPE-K // rpeInput: [B, K, H, D] -> transpose to [B, H, K, (1, D)] @@ -1371,12 +1398,17 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, if (b >= B || h >= H || q >= Q || k >= K) return; T sum = 0; - for (int i = 0; i < D; i++) { - sum += readInputTensor(rpeInput, b, k, h, i, B, K, H, D) * - readInputTensor(rpeWeights, i, h, q, k, D, H, Q, K); + const T* ip = rpeInput + getTensorIndex(b, k, h, 0, B, K, H, D); + const T* wt = rpeWeights + getTensorIndex(0, h, q, k, D, H, Q, K); + const int wtstep = H * Q * K; + + for (int i = 0, j = 0; i < D; i++, j += wtstep) { + sum += ip[i] * wt[j]; + // sum += readInputTensor(rpeInput, b, k, h, 0, B, K, H, D) * + // readInputTensor(rpeWeights, 0, h, q, k, D, H, Q, K); } int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); - output[outIdx] = (sum + skipAdd[outIdx]) * (T)outScale; + output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; } else if (rpetype == 2) { // RPE-V // rpeInput: [B, H, Q, K] -> transpose to [B, H, Q, (1, K)] @@ -1390,12 +1422,16 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, if (b >= B || h >= H || q >= Q || d >= D) return; T sum = 0; + const T* ip = rpeInput + getTensorIndex(b, h, q, 0, B, H, Q, K); + const T* wt = rpeWeights + getTensorIndex(d, h, q, 0, D, H, Q, K); + for (int i = 0; i < K; i++) { - sum += readInputTensor(rpeInput, b, h, q, i, B, H, Q, K) * - readInputTensor(rpeWeights, d, h, q, i, D, H, Q, K); + sum += ip[i] * wt[i]; + // sum += readInputTensor(rpeInput, b, h, q, 0, B, H, Q, K) * + // readInputTensor(rpeWeights, d, h, q, 0, D, H, Q, K); } int outIdx = getTensorIndex(b, q, h, d, B, Q, H, D); - output[outIdx] = (sum + skipAdd[outIdx]) * (T)outScale; + output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; } } From 506a4c4647792e954b4ccec8c6ebd411ab9c4cff Mon Sep 17 00:00:00 2001 From: almaudoh Date: Sun, 21 Jul 2024 05:26:43 +0200 Subject: [PATCH 4/7] Improved speed by optimizing rpe kernel. Still not enough yet. 
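
The RPE weights are now permuted once at load time (permuteTensor) so the
reduction dimension is innermost for both operands, letting each thread's
dot product use contiguous 16-byte loads via copyAs instead of the earlier
strided per-element reads. Rough sketch of the intended access pattern for
the fp32 path (base_in / base_wt are illustrative names, not identifiers
from the diff):

    // After the permutation both slices are contiguous, so the D-sum
    // (K-sum for RPE-V) can be read four floats at a time.
    float sum = 0.0f;
    for (int i = 0; i < D; i += 4) {
      float4 u = *reinterpret_cast<const float4*>(&input[base_in + i]);
      float4 w = *reinterpret_cast<const float4*>(&weights[base_wt + i]);
      sum += u.x * w.x + u.y * w.y + u.z * w.z + u.w * w.w;
    }
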
--- src/neural/cuda/common_kernels.cu | 145 ++++++++++++++++++++++-------- src/neural/cuda/kernels.h | 4 + src/neural/cuda/layers.cc | 39 +++++--- 3 files changed, 141 insertions(+), 47 deletions(-) diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index 5fdb5a728a..6ddfdbf49f 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -1316,7 +1316,7 @@ __device__ __forceinline__ int getTensorIndex(int i, int j, int k, int l, int I, int J, int K, int L) { if (i >= I || j >= J || k >= K || l >= L) return -1; - return (((((i * J) + j) * K) + k) * L) + l; + return (((((i * J) + j) * K) + k) * L) + l; // int index; // index = i; @@ -1327,13 +1327,14 @@ __device__ __forceinline__ int getTensorIndex(int i, int j, int k, int l, int I, // index *= L; // index += l; - // return index; } template -__device__ __forceinline__ dT readInputTensor(const sT* input_tensor, size_t i, size_t j, size_t k, - size_t l, size_t I, size_t J, size_t K, size_t L) { +__device__ __forceinline__ dT readInputTensor(const sT* input_tensor, size_t i, + size_t j, size_t k, size_t l, + size_t I, size_t J, size_t K, + size_t L) { // i is the outermost|slowest|most-significant index, while l is the // innermost|fastest|least-significant index. if (i >= I || j >= J || k >= K || l >= L) return 0; @@ -1352,6 +1353,39 @@ __device__ __forceinline__ dT readInputTensor(const sT* input_tensor, size_t i, return (dT)(input_tensor[index]); } +template +__device__ __forceinline__ T dotProductSum(const T* U, const T* V, int length, + bool fp16) { + T sum = 0; + // Load from memory (16 elements a time) + if (fp16) { + half u[8]; + half v[8]; +#pragma unroll + for (int h = 0; h < length; h += 8) { + copyAs(&u[0], &U[h]); + copyAs(&v[0], &V[h]); +#pragma unroll + for (int i = 0; i < 8; i++) { + sum += (T)u[i] * (T)v[i]; + } + } + } else { + float u[8]; + float v[8]; +#pragma unroll + for (int h = 0; h < length; h += 4) { + copyAs(&u[0], &U[h]); + copyAs(&v[0], &V[h]); +#pragma unroll + for (int i = 0; i < 8; i++) { + sum += u[i] * v[i]; + } + } + } + return sum; +} + template __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, const T* skipAdd, T* output, int B, @@ -1362,11 +1396,12 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, int bh = threadIdx.z + blockDim.z * blockIdx.z; int h = bh % H; int b = bh / H; + const bool fp16 = std::is_same::value; if (rpetype == 0) { // RPE-Q // rpeInput: [B, Q, H, D] -> transpose to [B, H, Q, (1, D)] - // rpeWeights: [D, H, Q, K] -> transpose to [1, H, Q, (D, K)] + // rpeWeights: [H, Q, D, K] -> transpose to [1, H, Q, (D, K)] // output: [B, H, Q, K] // Read tensors per the input layouts and write out per the output layout. 
@@ -1374,22 +1409,17 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, int k = x; if (b >= B || h >= H || q >= Q || k >= K) return; - T sum = 0; - const T* ip = rpeInput + getTensorIndex(b, q, h, 0, B, Q, H, D); - const T* wt = rpeWeights + getTensorIndex(0, h, q, k, D, H, Q, K); - const int wtstep = H * Q * K; + const int tensorIndex = getTensorIndex(b, q, h, 0, B, Q, H, D); + const int weightIndex = getTensorIndex(h, q, k, 0, H, Q, K, D); - for (int i = 0, j = 0; i < D; i++, j += wtstep) { - sum += ip[i] * wt[j]; - // sum += readInputTensor(rpeInput, b, q, h, i, B, Q, H, D) * - // readInputTensor(rpeWeights, i, h, q, k, D, H, Q, K); - } + T sum = dotProductSum(rpeInput + tensorIndex, rpeWeights + weightIndex, D, + fp16); int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; } else if (rpetype == 1) { // RPE-K // rpeInput: [B, K, H, D] -> transpose to [B, H, K, (1, D)] - // rpeWeights: [D, H, Q, K] -> transpose to [1, H, K, (D, Q)] + // rpeWeights: [H, K, Q, D] -> transpose to [1, H, K, (D, Q)] // output: [B, H, Q, K] // Read tensors per the input layouts and write out per the output layout. @@ -1397,22 +1427,16 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, int k = x; if (b >= B || h >= H || q >= Q || k >= K) return; - T sum = 0; - const T* ip = rpeInput + getTensorIndex(b, k, h, 0, B, K, H, D); - const T* wt = rpeWeights + getTensorIndex(0, h, q, k, D, H, Q, K); - const int wtstep = H * Q * K; - - for (int i = 0, j = 0; i < D; i++, j += wtstep) { - sum += ip[i] * wt[j]; - // sum += readInputTensor(rpeInput, b, k, h, 0, B, K, H, D) * - // readInputTensor(rpeWeights, 0, h, q, k, D, H, Q, K); - } + const int tensorIndex = getTensorIndex(b, k, h, 0, B, K, H, D); + const int weightIndex = getTensorIndex(h, k, q, 0, H, K, Q, D); + T sum = dotProductSum(rpeInput + tensorIndex, rpeWeights + weightIndex, D, + fp16); int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; } else if (rpetype == 2) { // RPE-V // rpeInput: [B, H, Q, K] -> transpose to [B, H, Q, (1, K)] - // rpeWeights: [D, H, Q, K] -> transpose to [1, H, Q, (K, D)] + // rpeWeights: [H, Q, K, D] -> transpose to [1, H, Q, (K, D)] // output: [B, Q, H, D] // The skip connection is also already in BQHD order. @@ -1421,15 +1445,10 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, int d = x; if (b >= B || h >= H || q >= Q || d >= D) return; - T sum = 0; - const T* ip = rpeInput + getTensorIndex(b, h, q, 0, B, H, Q, K); - const T* wt = rpeWeights + getTensorIndex(d, h, q, 0, D, H, Q, K); - - for (int i = 0; i < K; i++) { - sum += ip[i] * wt[i]; - // sum += readInputTensor(rpeInput, b, h, q, 0, B, H, Q, K) * - // readInputTensor(rpeWeights, d, h, q, 0, D, H, Q, K); - } + const int tensorIndex = getTensorIndex(b, h, q, 0, B, H, Q, K); + const int weightIndex = getTensorIndex(h, q, d, 0, H, Q, D, K); + T sum = dotProductSum(rpeInput + tensorIndex, rpeWeights + weightIndex, K, + fp16); int outIdx = getTensorIndex(b, q, h, d, B, Q, H, D); output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; } @@ -1468,6 +1487,54 @@ void multiplyRPEAttentionLogits(const T* rpeInput, const T* rpeWeights, ReportCUDAErrors(cudaGetLastError()); } +template +__global__ void permuteTensor_kernel(T* output, const T* input, int s1, int s2, + int s3, int s4, int p1, int p2, int p3, + int p4) { + // Shared memory for shape of tensor. 
+ __shared__ int tshp[4]; + if (threadIdx.x == 0) { + tshp[0] = s1; + tshp[1] = s2; + tshp[2] = s3; + tshp[3] = s4; + } + __syncthreads(); + + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if (tid >= s1 * s2 * s3 * s4) return; + + int tloc[] = {0, 0, 0, 0}; + int index = tid; + + tloc[3] = (index % s4); + index /= s4; + tloc[2] = index % s3; + index /= s3; + tloc[1] = index % s2; + index /= s2; + tloc[0] = index; + + int outIdx = (((((tloc[p1] * tshp[p2]) + tloc[p2]) * tshp[p3]) + tloc[p3]) * + tshp[p4]) + + tloc[p4]; + output[outIdx] = (T)input[tid]; +} + +template +void permuteTensor(T* output, const T* input, int s1, int s2, int s3, int s4, + int p1, int p2, int p3, int p4, cudaStream_t stream) { + // The order and bounds arrays are assumed to have 4 elements. + int elements = s1 * s2 * s3 * s4; + const int kBlockSize = 1024; + int blocks = DivUp(elements, kBlockSize); + + permuteTensor_kernel<<>>( + output, input, s1, s2, s3, s4, p1, p2, p3, p4); + ReportCUDAErrors(cudaGetLastError()); +} + // Template instantiation. template void copyTypeConverted(half* op, float* ip, int N, cudaStream_t stream); @@ -1765,5 +1832,13 @@ template void multiplyRPEAttentionLogits( float* output, int B, int H, int Q, int K, int D, float outScale, size_t rpetype, cudaStream_t stream); +template void permuteTensor(half* output, const half* input, int s1, + int s2, int s3, int s4, int p1, int p2, + int p3, int p4, cudaStream_t stream); + +template void permuteTensor(float* output, const float* input, int s1, + int s2, int s3, int s4, int p1, int p2, + int p3, int p4, cudaStream_t stream); + } // namespace cudnn_backend } // namespace lczero diff --git a/src/neural/cuda/kernels.h b/src/neural/cuda/kernels.h index 31db577808..15571b6acf 100644 --- a/src/neural/cuda/kernels.h +++ b/src/neural/cuda/kernels.h @@ -162,5 +162,9 @@ void multiplyRPEAttentionLogits(const T* rpeInput, const T* rpeWeights, int Q, int K, int D, float outScale, size_t rpetype, cudaStream_t stream); +template +void permuteTensor(T* output, const T* input, int s1, int s2, int s3, int s4, + int p1, int p2, int p3, int p4, cudaStream_t stream); + } // namespace cudnn_backend } // namespace lczero diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc index a7a930d29b..0efe24c8b3 100644 --- a/src/neural/cuda/layers.cc +++ b/src/neural/cuda/layers.cc @@ -1670,41 +1670,56 @@ EncoderBlock::EncoderBlock( // Allocate RPE weights and multiply by factorizer DataType* rpe_scratch; + int heads = encoder_heads_; + int depth = mha_q_size_ / encoder_heads_; mha_rpe_q_size_ = cpu_weights.mha.rpe_q.size(); if (mha_rpe_q_size_ > 0) { allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_q, scratch); // Gemm to factorize the RPE Q weights. - ReportCUDAErrors( - cudaMalloc(&mha_rpe_q, mha_q_size_ * 4096 * sizeof(DataType))); cublasXgemm(tmp_cublas, CUBLAS_OP_N, CUBLAS_OP_T, 4096, mha_q_size_, 225, 1.0f, mha_rpe_factorizer, 4096, - rpe_scratch, mha_q_size_, 0.0f, - (DataType*)mha_rpe_q, 4096); + rpe_scratch, mha_q_size_, 0.0f, (DataType*)scratch, + 4096); + + // Permute RPE Q weights: [D, H, Q, K] -> [H, Q, D, K] + // Permute RPE Q weights: [D, H, Q, K] -> [H, Q, K, D] + ReportCUDAErrors( + cudaMalloc(&mha_rpe_q, mha_q_size_ * 4096 * sizeof(DataType))); + permuteTensor((DataType*)mha_rpe_q, (const DataType*)scratch, depth, heads, 64, + 64, 1, 2, 3, 0, 0); } mha_rpe_k_size_ = cpu_weights.mha.rpe_k.size(); if (mha_rpe_k_size_ > 0) { allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_k, scratch); // Gemm to factorize the RPE K weights. 
- ReportCUDAErrors( - cudaMalloc(&mha_rpe_k, mha_k_size_ * 4096 * sizeof(DataType))); cublasXgemm(tmp_cublas, CUBLAS_OP_N, CUBLAS_OP_T, 4096, mha_k_size_, 225, 1.0f, mha_rpe_factorizer, 4096, - rpe_scratch, mha_k_size_, 0.0f, - (DataType*)mha_rpe_k, 4096); + rpe_scratch, mha_k_size_, 0.0f, (DataType*)scratch, + 4096); + + // Permute RPE K weights: [D, H, Q, K] -> [H, K, D, Q] + ReportCUDAErrors( + cudaMalloc(&mha_rpe_k, mha_k_size_ * 4096 * sizeof(DataType))); + permuteTensor((DataType*)mha_rpe_k, (const DataType*)scratch, depth, + heads, 64, 64, 1, 3, 2, 0, 0); } mha_rpe_v_size_ = cpu_weights.mha.rpe_v.size(); if (mha_rpe_v_size_ > 0) { allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_v, scratch); // Gemm to factorize the RPE V weights. - ReportCUDAErrors( - cudaMalloc(&mha_rpe_v, mha_v_size_ * 4096 * sizeof(DataType))); cublasXgemm(tmp_cublas, CUBLAS_OP_N, CUBLAS_OP_T, 4096, mha_v_size_, 225, 1.0f, mha_rpe_factorizer, 4096, - rpe_scratch, mha_v_size_, 0.0f, - (DataType*)mha_rpe_v, 4096); + rpe_scratch, mha_v_size_, 0.0f, (DataType*)scratch, + 4096); + + // Permute RPE V weights: [D, H, Q, K] -> [H, Q, K, D] + ReportCUDAErrors( + cudaMalloc(&mha_rpe_v, mha_v_size_ * 4096 * sizeof(DataType))); + permuteTensor((DataType*)mha_rpe_v, (const DataType*)scratch, depth, + heads, 64, 64, 1, 2, 0, 3, 0); } cublasDestroy(tmp_cublas); } From 70d267287aa33dfa9c282b1a14b3dda3343aa870 Mon Sep 17 00:00:00 2001 From: almaudoh Date: Sun, 21 Jul 2024 20:15:00 +0200 Subject: [PATCH 5/7] Fix bugs in fp32 and non-rpe code paths. --- src/neural/cuda/common_kernels.cu | 2 +- src/neural/cuda/layers.cc | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index 6ddfdbf49f..9337e26bea 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -1378,7 +1378,7 @@ __device__ __forceinline__ T dotProductSum(const T* U, const T* V, int length, copyAs(&u[0], &U[h]); copyAs(&v[0], &V[h]); #pragma unroll - for (int i = 0; i < 8; i++) { + for (int i = 0; i < 4; i++) { sum += u[i] * v[i]; } } diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc index 0efe24c8b3..e610214870 100644 --- a/src/neural/cuda/layers.cc +++ b/src/neural/cuda/layers.cc @@ -1641,9 +1641,12 @@ EncoderBlock::EncoderBlock( smol_global = smolgen_global_scratch; } - // RPE weights. /* - if (cpu_weights.mha.rpe_q.size() > 0 || cpu_weights.mha.rpe_k.size() > 0 || - cpu_weights.mha.rpe_v.size() > 0) { + // RPE weights. + mha_rpe_q_size_ = cpu_weights.mha.rpe_q.size(); + mha_rpe_k_size_ = cpu_weights.mha.rpe_k.size(); + mha_rpe_v_size_ = cpu_weights.mha.rpe_v.size(); + + if (mha_rpe_q_size_ > 0 || mha_rpe_k_size_ > 0 || mha_rpe_v_size_ > 0) { // Weights factorizer. 
int rows = 15 * 15; int cols = 64 * 64; @@ -1672,7 +1675,6 @@ EncoderBlock::EncoderBlock( DataType* rpe_scratch; int heads = encoder_heads_; int depth = mha_q_size_ / encoder_heads_; - mha_rpe_q_size_ = cpu_weights.mha.rpe_q.size(); if (mha_rpe_q_size_ > 0) { allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_q, scratch); @@ -1686,10 +1688,9 @@ EncoderBlock::EncoderBlock( // Permute RPE Q weights: [D, H, Q, K] -> [H, Q, K, D] ReportCUDAErrors( cudaMalloc(&mha_rpe_q, mha_q_size_ * 4096 * sizeof(DataType))); - permuteTensor((DataType*)mha_rpe_q, (const DataType*)scratch, depth, heads, 64, - 64, 1, 2, 3, 0, 0); + permuteTensor((DataType*)mha_rpe_q, (const DataType*)scratch, depth, + heads, 64, 64, 1, 2, 3, 0, 0); } - mha_rpe_k_size_ = cpu_weights.mha.rpe_k.size(); if (mha_rpe_k_size_ > 0) { allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_k, scratch); @@ -1705,7 +1706,6 @@ EncoderBlock::EncoderBlock( permuteTensor((DataType*)mha_rpe_k, (const DataType*)scratch, depth, heads, 64, 64, 1, 3, 2, 0, 0); } - mha_rpe_v_size_ = cpu_weights.mha.rpe_v.size(); if (mha_rpe_v_size_ > 0) { allocAndUpload(&rpe_scratch, cpu_weights.mha.rpe_v, scratch); From 75a099fe36b74f908c6bb8b9386fe9327428c028 Mon Sep 17 00:00:00 2001 From: almaudoh Date: Mon, 2 Sep 2024 14:13:50 +0200 Subject: [PATCH 6/7] Split kernel dot product workload among multiple threads in warp --- src/neural/cuda/common_kernels.cu | 226 +++++++++++++++++++++++++----- src/neural/cuda/layers.cc | 58 ++++++-- 2 files changed, 237 insertions(+), 47 deletions(-) diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index 9337e26bea..2d410f3b9b 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -37,6 +37,7 @@ namespace lczero { namespace cudnn_backend { namespace { constexpr int kInputPlanes = 112; +constexpr int kNumRpeKernelSplits = 2; } // namespace ///////////////////////////////////////////////////////////////////////////// @@ -1353,16 +1354,123 @@ __device__ __forceinline__ dT readInputTensor(const sT* input_tensor, size_t i, return (dT)(input_tensor[index]); } +__device__ __forceinline__ float sharedDotProductSum(float val, int x, int y) { + // Sum is done along the x-axis, while y is the accumulator. + int warpPos = y % 32; + __shared__ float partialSum[32]; + if (x == 0) partialSum[warpPos] = 0.0f; + + __syncthreads(); + + // Get warp-wide sum. + float warpSum = warpReduce(val); + if (x == 0) atomicAdd(&partialSum[warpPos], warpSum); + + __syncthreads(); + + return partialSum[warpPos]; +} + +template +__global__ void rpeVectorMultiply_parallel_kernel( + const T* rpeInput, const T* rpeWeights, const T* skipAdd, T* output, int B, + int H, int Q, int K, int D, float outScale, size_t rpetype) { + int x = threadIdx.x; + int y = threadIdx.y + blockDim.y * blockIdx.y; + int z = threadIdx.z + blockDim.z * blockIdx.z; + int h = z % H; + z = z / H; + int q = z % Q; + z = z / Q; + int b = z % B; + + if (rpetype == 0) { + // RPE-Q + // rpeInput: [B, Q, H, D] -> transpose to [B, H, Q, (1, D)] + // rpeWeights: [H, Q, K, D] -> transpose to [1, H, Q, (D, K)] + // output: [B, H, Q, K] + + // Read tensors per the input layouts and write out per the output layout. + // Sum is along the D dimension, and K is on the x-axis. + // Each thread handles one product of the sum. Thread 0 sums the products. 
+ int d = x; + int k = y; + if (b >= B || h >= H || q >= Q || k >= K || d >= D) return; + + const int tensorIndex = getTensorIndex(b, q, h, d, B, Q, H, D); + const int weightIndex = getTensorIndex(h, q, k, d, H, Q, K, D); + + T sum = (T)sharedDotProductSum( + (float)rpeInput[tensorIndex] * (float)rpeWeights[weightIndex], x, y); + + if (d == 0) { + int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); + output[outIdx] = (sum + (T)skipAdd[outIdx]) * (T)outScale; + } + } else if (rpetype == 1) { + // RPE-K + // rpeInput: [B, K, H, D] -> transpose to [B, H, K, (1, D)] + // rpeWeights: [H, K, Q, D] -> transpose to [1, H, K, (D, Q)] + // output: [B, H, Q, K] + + // Read tensors per the input layouts and write out per the output layout. + // Sum is along the D dimension, and K is on the x-axis. + // Each thread handles one product of the sum. Thread 0 sums the products. + int d = x; + int k = y; + if (b >= B || h >= H || q >= Q || k >= K || d >= D) return; + + const int tensorIndex = getTensorIndex(b, k, h, d, B, K, H, D); + const int weightIndex = getTensorIndex(h, k, q, d, H, K, Q, D); + + T sum = (T)sharedDotProductSum( + (float)rpeInput[tensorIndex] * (float)rpeWeights[weightIndex], x, y); + + if (d == 0) { + int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); + output[outIdx] = (sum + (T)skipAdd[outIdx]) * (T)outScale; + } + } else if (rpetype == 2) { + // RPE-V + // rpeInput: [B, H, Q, K] -> transpose to [B, H, Q, (1, K)] + // rpeWeights: [H, Q, D, K] -> transpose to [1, H, Q, (K, D)] + // output: [B, Q, H, D] + // The skip connection is also already in BQHD order. + + // Read tensors per the input layouts and write out per the output layout. + // Sum is along the K dimension, and D is on the x-axis. + int k = x; + int d = y; + if (b >= B || h >= H || q >= Q || k >= K || d >= D) return; + + const int tensorIndex = getTensorIndex(b, h, q, k, B, H, Q, K); + const int weightIndex = getTensorIndex(h, q, d, k, H, Q, D, K); + + T sum = (T)sharedDotProductSum( + (float)rpeInput[tensorIndex] * (float)rpeWeights[weightIndex], x, y); + + if (k == 0) { + int outIdx = getTensorIndex(b, q, h, d, B, Q, H, D); + output[outIdx] = (sum + (T)skipAdd[outIdx]) * (T)outScale; + } + } +} + template -__device__ __forceinline__ T dotProductSum(const T* U, const T* V, int length, - bool fp16) { +__device__ __forceinline__ T dotProductSum(int x, const T* U, const T* V, + int length, bool fp16) { + assert(length >= 16); T sum = 0; + int sublen = length / kNumRpeKernelSplits; + int lane = x & (kNumRpeKernelSplits - 1); + int start = lane * sublen; + // Load from memory (16 elements a time) if (fp16) { half u[8]; half v[8]; #pragma unroll - for (int h = 0; h < length; h += 8) { + for (int h = start; h < start + sublen; h += 8) { copyAs(&u[0], &U[h]); copyAs(&v[0], &V[h]); #pragma unroll @@ -1371,18 +1479,25 @@ __device__ __forceinline__ T dotProductSum(const T* U, const T* V, int length, } } } else { - float u[8]; - float v[8]; + float u[4]; + float v[4]; #pragma unroll - for (int h = 0; h < length; h += 4) { + for (int h = start; h < start + sublen; h += 4) { copyAs(&u[0], &U[h]); copyAs(&v[0], &V[h]); #pragma unroll for (int i = 0; i < 4; i++) { - sum += u[i] * v[i]; + sum += (T)u[i] * (T)v[i]; } } } + + // Warp-level reduction to sum up adjacent threads. 
+ __syncwarp(); +#pragma unroll + for (int i = 1; i < kNumRpeKernelSplits; i = i << 1) { + sum += __shfl_down_sync(0xffffffff, sum, i); + } return sum; } @@ -1391,31 +1506,48 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, const T* skipAdd, T* output, int B, int H, int Q, int K, int D, float outScale, size_t rpetype) { - int x = threadIdx.x + blockDim.x * blockIdx.x; - int q = threadIdx.y + blockDim.y * blockIdx.y; - int bh = threadIdx.z + blockDim.z * blockIdx.z; - int h = bh % H; - int b = bh / H; + const int x = threadIdx.x + blockDim.x * blockIdx.x; + const int q = threadIdx.y + blockDim.y * blockIdx.y; + const int bh = threadIdx.z + blockDim.z * blockIdx.z; + const int h = bh % H; + const int b = bh / H; const bool fp16 = std::is_same::value; + const int lane = x & (kNumRpeKernelSplits - 1); if (rpetype == 0) { // RPE-Q // rpeInput: [B, Q, H, D] -> transpose to [B, H, Q, (1, D)] - // rpeWeights: [H, Q, D, K] -> transpose to [1, H, Q, (D, K)] + // rpeWeights: [H, Q, K, D] -> transpose to [1, H, Q, (D, K)] // output: [B, H, Q, K] // Read tensors per the input layouts and write out per the output layout. // Sum is along the D dimension, and K is on the x-axis. - int k = x; + int k = x / kNumRpeKernelSplits; if (b >= B || h >= H || q >= Q || k >= K) return; const int tensorIndex = getTensorIndex(b, q, h, 0, B, Q, H, D); const int weightIndex = getTensorIndex(h, q, k, 0, H, Q, K, D); - T sum = dotProductSum(rpeInput + tensorIndex, rpeWeights + weightIndex, D, - fp16); - int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); - output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; + T sum = dotProductSum(x, rpeInput + tensorIndex, rpeWeights + weightIndex, + D, fp16); + + // if (b == 0 && h == 0 && q == 0 && k == 0) { + // // if (lane == 0) { + // // sum += __shfl_down_sync(0xffffffff, sum, 1); + // // sum = __shfl_down_sync(0xffffffff, sum, 1); + // // return sum; + // // printf("\n"); + // for (int i = 0; i < 32; i++) { + // printf("x: %i, lane: %i, thread warp id: %d, sum: %f\n", x, lane, + // i, (float)__shfl_sync(0xffffffff, sum, i)); + // } + // // } + // printf("x: %d, sum: %f\n", x, (float)sum); + // } + if (lane == 0) { + int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); + output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; + } } else if (rpetype == 1) { // RPE-K // rpeInput: [B, K, H, D] -> transpose to [B, H, K, (1, D)] @@ -1424,33 +1556,37 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, // Read tensors per the input layouts and write out per the output layout. // Sum is along the D dimension, and K is on the x-axis. 
- int k = x; + int k = x / kNumRpeKernelSplits; if (b >= B || h >= H || q >= Q || k >= K) return; const int tensorIndex = getTensorIndex(b, k, h, 0, B, K, H, D); const int weightIndex = getTensorIndex(h, k, q, 0, H, K, Q, D); - T sum = dotProductSum(rpeInput + tensorIndex, rpeWeights + weightIndex, D, - fp16); - int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); - output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; + T sum = dotProductSum(x, rpeInput + tensorIndex, rpeWeights + weightIndex, + D, fp16); + if (lane == 0) { + int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); + output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; + } } else if (rpetype == 2) { // RPE-V // rpeInput: [B, H, Q, K] -> transpose to [B, H, Q, (1, K)] - // rpeWeights: [H, Q, K, D] -> transpose to [1, H, Q, (K, D)] + // rpeWeights: [H, Q, D, K] -> transpose to [1, H, Q, (K, D)] // output: [B, Q, H, D] // The skip connection is also already in BQHD order. // Read tensors per the input layouts and write out per the output layout. // Sum is along the K dimension, and D is on the x-axis. - int d = x; + int d = x / kNumRpeKernelSplits; if (b >= B || h >= H || q >= Q || d >= D) return; const int tensorIndex = getTensorIndex(b, h, q, 0, B, H, Q, K); const int weightIndex = getTensorIndex(h, q, d, 0, H, Q, D, K); - T sum = dotProductSum(rpeInput + tensorIndex, rpeWeights + weightIndex, K, - fp16); - int outIdx = getTensorIndex(b, q, h, d, B, Q, H, D); - output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; + T sum = dotProductSum(x, rpeInput + tensorIndex, rpeWeights + weightIndex, + K, fp16); + if (lane == 0) { + int outIdx = getTensorIndex(b, q, h, d, B, Q, H, D); + output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; + } } } @@ -1465,25 +1601,47 @@ void multiplyRPEAttentionLogits(const T* rpeInput, const T* rpeWeights, // rpeType: Q | K | V // rpeInput: [B, Q, H, D] | [B, K, H, D] | [B, H, Q, K] - // rpeWeights: [D, H, Q, K] | [D, H, Q, K] | [D, H, Q, K] + // rpeWeights: [H, Q, K, D] | [H, K, Q, D] | [H, Q, D, K] + // attnInput: [B, H, Q, K] | [B, H, Q, K] | [B, Q, H, D] + // int lda = (rpetype == 2) ? K : D; + // if (lda > 64 || rpetype != 0) { +#if 1 // 3D block structure where x-axis maps to K (or D for rpetype==2), y-axis to // Q and z-axis to BH. Each thread calculates the vector sum-product for the // cube at BHQK | BHQD (i.e. "d,dk->k" | "d,dq->q" | "k,kd->d"). dim3 blockDim, gridDim; int X = rpetype == 2 ? D : K; blockDim.x = std::min(32, X); - blockDim.y = std::min(32, Q); - blockDim.z = std::min(std::max(1024 / (blockDim.x * blockDim.y), 1u), + blockDim.y = std::min(16, Q); + blockDim.z = std::min(std::max(512 / (blockDim.x * blockDim.y), 1u), (unsigned int)(B * H)); - gridDim.x = DivUp(X, blockDim.x); + gridDim.x = DivUp(X * kNumRpeKernelSplits, blockDim.x); gridDim.y = DivUp(Q, blockDim.y); gridDim.z = DivUp(B * H, blockDim.z); rpeVectorMultiply_kernel<<>>( rpeInput, rpeWeights, attnInput, output, B, H, Q, K, D, outScale, rpetype); - +#else + // } else { + // 3D block structure where x-axis maps to D (or K for rpetype==2), y-axis + // maps to K (or D for rpetype==2) and z-axis to BQH. Each thread calculates + // the product while the warp-sum does the sum of the vector dot-product. + dim3 blockDim, gridDim; + int Y = rpetype == 2 ? 
D : K; + blockDim.x = std::min(64, lda); + blockDim.y = std::min(DivUp(1024, blockDim.x), Y); + blockDim.z = std::max(1024 / (blockDim.x * blockDim.y), 1u); + gridDim.x = 1; + gridDim.y = DivUp(Y, blockDim.y); + gridDim.z = DivUp(B * H * Q, blockDim.z); + + rpeVectorMultiply_parallel_kernel<<>>( + rpeInput, rpeWeights, attnInput, output, B, H, Q, K, D, outScale, + rpetype); +#endif + // } ReportCUDAErrors(cudaGetLastError()); } diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc index e610214870..c7f1add37f 100644 --- a/src/neural/cuda/layers.cc +++ b/src/neural/cuda/layers.cc @@ -39,7 +39,27 @@ namespace lczero { -#if 0 +#if 1 +// function to calculate mean +static float mean(float arr[], int n) { + float sum = 0; + for (int i = 0; i < n; i++) { + sum += arr[i]; + } + return sum / n; +} + +// function to calculate standard deviation +static float stdDev(float arr[], int n) { + float m = mean(arr, n); // get the mean + float var = 0; // initialize variance + for (int i = 0; i < n; i++) { + var += pow(arr[i] - m, 2); // add the squared difference from mean + } + var /= n; // divide by number of elements + return sqrt(var); // return the square root of variance +} + // debug code to dump allocation in GPU memory template void dumpTensor(T* memory, int elements, const char* message, int lines = -1, @@ -52,9 +72,10 @@ void dumpTensor(T* memory, int elements, const char* message, int lines = -1, cudaMemcpy(temp, memory, bytes, cudaMemcpyDeviceToHost); float maxval = -std::numeric_limits::max(); float minval = std::numeric_limits::max(); - int nans = 0; - int nanss[10]{}; + int cnans = 0; + int nans[10]{}; + std::vector fpArr(elements); for (int i = 0; i < elements; i++) { float val; if (fp16) { @@ -64,12 +85,13 @@ void dumpTensor(T* memory, int elements, const char* message, int lines = -1, float* arr = (float*)temp; val = arr[i]; } + fpArr[i] = val; maxval = std::max(maxval, val); minval = std::min(minval, val); if (std::isnan(val)) { - if (nans < 10) nanss[nans] = i; - nans++; + if (cnans < 10) nans[cnans] = i; + cnans++; } if ((i >= start && (i < start + lines || lines == -1)) || @@ -85,11 +107,18 @@ void dumpTensor(T* memory, int elements, const char* message, int lines = -1, if (minval == std::numeric_limits::max()) minval = std::numeric_limits::quiet_NaN(); - printf("Max: %.6f, Min: %.6f, NaNs: %i of %i", maxval, minval, nans, - elements); - printf("\nNaN indices: "); - for (int i = 0; i < nans && i < 10; i++) printf("%i ", nanss[i]); - if (nans > 10) printf("......"); + float avg = mean(&fpArr[0], elements); + float stddev = stdDev(&fpArr[0], elements); + printf( + "Max: %.6f, Min: %.6f, Mean: %.6f, StdDev: %.6f\n" + "NaNs: %i of %i", + maxval, minval, avg, stddev, cnans, elements); + + if (cnans > 0) { + printf("\nNaN indices: "); + for (int i = 0; i < cnans && i < 10; i++) printf("%i ", nans[i]); + if (cnans > 10) printf("......"); + } printf("\n"); } #endif @@ -1684,7 +1713,6 @@ EncoderBlock::EncoderBlock( rpe_scratch, mha_q_size_, 0.0f, (DataType*)scratch, 4096); - // Permute RPE Q weights: [D, H, Q, K] -> [H, Q, D, K] // Permute RPE Q weights: [D, H, Q, K] -> [H, Q, K, D] ReportCUDAErrors( cudaMalloc(&mha_rpe_q, mha_q_size_ * 4096 * sizeof(DataType))); @@ -1700,7 +1728,7 @@ EncoderBlock::EncoderBlock( rpe_scratch, mha_k_size_, 0.0f, (DataType*)scratch, 4096); - // Permute RPE K weights: [D, H, Q, K] -> [H, K, D, Q] + // Permute RPE K weights: [D, H, Q, K] -> [H, K, Q, D] ReportCUDAErrors( cudaMalloc(&mha_rpe_k, mha_k_size_ * 4096 * sizeof(DataType))); 
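+      // Illustrative note (assumption about permuteTensor's semantics, added
+      // for clarity): the [D, H, Q, K] -> [H, K, Q, D] permutation above is
+      // read here as dst[h][k][q][d] = src[d][h][q][k], which leaves the D
+      // elements of each (h, k, q) weight row contiguous for the vectorized
+      // loads in the RPE kernels.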
permuteTensor((DataType*)mha_rpe_k, (const DataType*)scratch, depth, @@ -1715,7 +1743,7 @@ EncoderBlock::EncoderBlock( rpe_scratch, mha_v_size_, 0.0f, (DataType*)scratch, 4096); - // Permute RPE V weights: [D, H, Q, K] -> [H, Q, K, D] + // Permute RPE V weights: [D, H, Q, K] -> [H, Q, D, K] ReportCUDAErrors( cudaMalloc(&mha_rpe_v, mha_v_size_ * 4096 * sizeof(DataType))); permuteTensor((DataType*)mha_rpe_v, (const DataType*)scratch, depth, @@ -1903,6 +1931,8 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, multiplyRPEAttentionLogits(mha_q, mha_rpe_q, buffer1, buffer1, N, encoder_heads_, 64, 64, depth, outScale, 0, stream); + // dumpTensor((DataType*)buffer1, 64 * d_model * N, "from kernel", 8192); + // exit(0); } if (mha_rpe_k_size_ > 0) { // Matrix-vector multiplication for key x rpe_k @@ -1953,6 +1983,8 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, multiplyRPEAttentionLogits(buffer1, mha_rpe_v, buffer2, buffer2, N, encoder_heads_, 64, 64, depth, 1.0f, 2, stream); + // dumpTensor((DataType*)buffer2, 4096 * d_model * N, "from kernel", 8192); + // exit(0); } // #final dense layer (mha_dense), buffer2 -> buffer1 From 237766a90330fc33bafd566f80784d56b4bd5810 Mon Sep 17 00:00:00 2001 From: almaudoh Date: Sun, 8 Sep 2024 19:36:30 +0200 Subject: [PATCH 7/7] Fuse RPE-Q and RPE-K kernels. 10% speedup. --- src/neural/cuda/common_kernels.cu | 84 ++++++++++++++++++++++++++----- src/neural/cuda/kernels.h | 7 +++ src/neural/cuda/layers.cc | 53 +++++++++++-------- 3 files changed, 109 insertions(+), 35 deletions(-) diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index 2d410f3b9b..7b0bb9f277 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -1531,19 +1531,6 @@ __global__ void rpeVectorMultiply_kernel(const T* rpeInput, const T* rpeWeights, T sum = dotProductSum(x, rpeInput + tensorIndex, rpeWeights + weightIndex, D, fp16); - // if (b == 0 && h == 0 && q == 0 && k == 0) { - // // if (lane == 0) { - // // sum += __shfl_down_sync(0xffffffff, sum, 1); - // // sum = __shfl_down_sync(0xffffffff, sum, 1); - // // return sum; - // // printf("\n"); - // for (int i = 0; i < 32; i++) { - // printf("x: %i, lane: %i, thread warp id: %d, sum: %f\n", x, lane, - // i, (float)__shfl_sync(0xffffffff, sum, i)); - // } - // // } - // printf("x: %d, sum: %f\n", x, (float)sum); - // } if (lane == 0) { int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); output[outIdx] = ((T)sum + (T)skipAdd[outIdx]) * (T)outScale; @@ -1645,6 +1632,67 @@ void multiplyRPEAttentionLogits(const T* rpeInput, const T* rpeWeights, ReportCUDAErrors(cudaGetLastError()); } +template +__global__ void rpeQK_multiply_kernel(const T* rpeInputQ, const T* rpeWeightsQ, + const T* rpeInputK, const T* rpeWeightsK, + const T* skipAdd, T* output, int B, int H, + int Q, int K, int D, float outScale) { + // Fused version of rpeVectorMultiply_kernel for RPE-Q and RPE-K. + const int x = threadIdx.x + blockDim.x * blockIdx.x; + const int q = threadIdx.y + blockDim.y * blockIdx.y; + const int bh = threadIdx.z + blockDim.z * blockIdx.z; + const int h = bh % H; + const int b = bh / H; + const bool fp16 = std::is_same::value; + const int lane = x & (kNumRpeKernelSplits - 1); + + // Read tensors per the input layouts and write out per the output layout. + // Sum is along the D dimension, and K is on the x-axis. 
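+  // Illustrative summary (added commentary): per output element the fused
+  // kernel evaluates
+  //   out[b,h,q,k] = (sum_d Q[b,q,h,d] * rpe_q[h,q,k,d]
+  //                   + sum_d K[b,k,h,d] * rpe_k[h,k,q,d]
+  //                   + attn[b,h,q,k]) * outScale
+  // i.e. the two separate RPE-Q / RPE-K passes collapsed into a single
+  // read-modify-write of the attention logits.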
+ int k = x / kNumRpeKernelSplits; + if (b >= B || h >= H || q >= Q || k >= K) return; + + // RPE-Q sum + const int tidxq = getTensorIndex(b, q, h, 0, B, Q, H, D); + const int widxq = getTensorIndex(h, q, k, 0, H, Q, K, D); + T sum1 = dotProductSum(x, rpeInputQ + tidxq, rpeWeightsQ + widxq, D, fp16); + + // RPE-K sum. + const int tidxk = getTensorIndex(b, k, h, 0, B, K, H, D); + const int widxk = getTensorIndex(h, k, q, 0, H, K, Q, D); + T sum2 = dotProductSum(x, rpeInputK + tidxk, rpeWeightsK + widxk, D, fp16); + + // Write out the result. + if (lane == 0) { + int outIdx = getTensorIndex(b, h, q, k, B, H, Q, K); + output[outIdx] = (T)(sum1 + sum2 + skipAdd[outIdx]) * (T)outScale; + } +} + +template +void multiplyRpeQKLogits(const T* rpeInputQ, const T* rpeWeightsQ, + const T* rpeInputK, const T* rpeWeightsK, + const T* attnInput, T* output, int B, int H, int Q, + int K, int D, float outScale, cudaStream_t stream) { + // rpeType: Q | K + // rpeInput: [B, Q, H, D] | [B, K, H, D] + // rpeWeights: [H, Q, K, D] | [H, K, Q, D] + // attnInput: [B, H, Q, K] | [B, H, Q, K] + dim3 blockDim, gridDim; + blockDim.x = std::min(32, K); + blockDim.y = std::min(16, Q); + blockDim.z = std::min(std::max(512 / (blockDim.x * blockDim.y), 1u), + (unsigned int)(B * H)); + gridDim.x = DivUp(K * kNumRpeKernelSplits, blockDim.x); + gridDim.y = DivUp(Q, blockDim.y); + gridDim.z = DivUp(B * H, blockDim.z); + + rpeQK_multiply_kernel<<>>( + rpeInputQ, rpeWeightsQ, rpeInputK, rpeWeightsK, attnInput, output, B, H, + Q, K, D, outScale); + + ReportCUDAErrors(cudaGetLastError()); +} + template __global__ void permuteTensor_kernel(T* output, const T* input, int s1, int s2, int s3, int s4, int p1, int p2, int p3, @@ -1990,6 +2038,16 @@ template void multiplyRPEAttentionLogits( float* output, int B, int H, int Q, int K, int D, float outScale, size_t rpetype, cudaStream_t stream); +template void multiplyRpeQKLogits( + const half* rpeInputQ, const half* rpeWeightsQ, const half* rpeInputK, + const half* rpeWeightsK, const half* attnInput, half* output, int B, int H, + int Q, int K, int D, float outScale, cudaStream_t stream); + +template void multiplyRpeQKLogits( + const float* rpeInputQ, const float* rpeWeightsQ, const float* rpeInputK, + const float* rpeWeightsK, const float* attnInput, float* output, int B, + int H, int Q, int K, int D, float outScale, cudaStream_t stream); + template void permuteTensor(half* output, const half* input, int s1, int s2, int s3, int s4, int p1, int p2, int p3, int p4, cudaStream_t stream); diff --git a/src/neural/cuda/kernels.h b/src/neural/cuda/kernels.h index 15571b6acf..966207f378 100644 --- a/src/neural/cuda/kernels.h +++ b/src/neural/cuda/kernels.h @@ -162,6 +162,13 @@ void multiplyRPEAttentionLogits(const T* rpeInput, const T* rpeWeights, int Q, int K, int D, float outScale, size_t rpetype, cudaStream_t stream); +template +void multiplyRpeQKLogits(const T* rpeInputQ, const T* rpeWeightsQ, + const T* rpeInputK, const T* rpeWeightsK, + const T* attnInput, T* output, int B, int H, int Q, + int K, int D, float outScale, + cudaStream_t stream); + template void permuteTensor(T* output, const T* input, int s1, int s2, int s3, int s4, int p1, int p2, int p3, int p4, cudaStream_t stream); diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc index c7f1add37f..c72fbe0892 100644 --- a/src/neural/cuda/layers.cc +++ b/src/neural/cuda/layers.cc @@ -1922,26 +1922,37 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, { // RPE Q and K. 
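+      // Note (illustrative, based only on the commit message's 10% figure):
+      // the fused path below presumably wins by touching the [B, H, 64, 64]
+      // logits tensor once instead of twice and by sharing the per-thread
+      // index math between the Q and K contractions.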
- if (mha_rpe_q_size_ > 0) { - // Matrix-vector multiplication for query x rpe_q - // Note: mha_q here is not yet transposed, so shape is still BQHD. - // mha_q @ rpe_q: [B, Q, H, D] x [D, H, Q, K] - // Kernel performs the required transpositions. - float outScale = mha_rpe_k_size_ == 0 ? factor : 1.0f; - multiplyRPEAttentionLogits(mha_q, mha_rpe_q, buffer1, buffer1, - N, encoder_heads_, 64, 64, depth, - outScale, 0, stream); - // dumpTensor((DataType*)buffer1, 64 * d_model * N, "from kernel", 8192); - // exit(0); - } - if (mha_rpe_k_size_ > 0) { - // Matrix-vector multiplication for key x rpe_k - // Note: mha_k here is not yet transposed, so shape is still BKHD. - // mha_k @ rpe_k: [B, K, H, D] x [D, H, Q, K] - // Kernel performs the required transpositions. - multiplyRPEAttentionLogits(mha_k, mha_rpe_k, buffer1, buffer1, - N, encoder_heads_, 64, 64, depth, - factor, 1, stream); + if (mha_rpe_q_size_ > 0 && mha_rpe_k_size_ > 0) { + // Matrix-vector multiplication for query x rpe_q + key x rpe_k + attn + // Note: mha_q and mha_k here are not yet transposed, so shape is still + // BQHD/BKHD. mha_q @ rpe_q: [B, Q, H, D] x [D, H, Q, K] mha_k @ rpe_k: + // [B, K, H, D] x [D, H, Q, K] Kernel performs the required + // transpositions. + multiplyRpeQKLogits(mha_q, mha_rpe_q, mha_k, mha_rpe_k, buffer1, + buffer1, N, encoder_heads_, 64, 64, depth, + factor, stream); + } else { + // RPE Q. + if (mha_rpe_q_size_ > 0) { + // Matrix-vector multiplication for query x rpe_q + // Note: mha_q here is not yet transposed, so shape is still BQHD. + // mha_q @ rpe_q: [B, Q, H, D] x [D, H, Q, K] + // Kernel performs the required transpositions. + float outScale = mha_rpe_k_size_ == 0 ? factor : 1.0f; + multiplyRPEAttentionLogits(mha_q, mha_rpe_q, buffer1, buffer1, + N, encoder_heads_, 64, 64, depth, + outScale, 0, stream); + } + // RPE K. + if (mha_rpe_k_size_ > 0) { + // Matrix-vector multiplication for key x rpe_k + // Note: mha_k here is not yet transposed, so shape is still BKHD. + // mha_k @ rpe_k: [B, K, H, D] x [D, H, Q, K] + // Kernel performs the required transpositions. + multiplyRPEAttentionLogits(mha_k, mha_rpe_k, buffer1, buffer1, + N, encoder_heads_, 64, 64, depth, + factor, 1, stream); + } } } // attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1) @@ -1983,8 +1994,6 @@ void EncoderBlock::Eval(int N, DataType* in_out_tensor, multiplyRPEAttentionLogits(buffer1, mha_rpe_v, buffer2, buffer2, N, encoder_heads_, 64, 64, depth, 1.0f, 2, stream); - // dumpTensor((DataType*)buffer2, 4096 * d_model * N, "from kernel", 8192); - // exit(0); } // #final dense layer (mha_dense), buffer2 -> buffer1
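+    // Illustrative recap (added commentary, with assumptions about the
+    // surrounding buffers): when RPE-V weights are present, buffer2 now
+    // holds, in [B, Q, H, D] order, the attention output plus
+    //   buffer2[b,q,h,d] += sum_k attn[b,h,q,k] * rpe_v[h,q,d,k]
+    // before being treated as [B, 64, d_model] for the dense layer below.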