add old qkv code as fallback for gpus with cc < 5.3
borg323 committed Apr 13, 2023
1 parent a6ce7b6 commit f1bb681
Showing 3 changed files with 41 additions and 4 deletions.
src/neural/cuda/layers.cc (37 additions, 2 deletions)
@@ -1772,7 +1772,7 @@ void EncoderBlock<DataType>::Eval(int N, DataType* in_out_tensor,
float factor = 1.0f / sqrt((float)depth);

// matmul_qk = tf.matmul(q, k, transpose_b=True)
-  {
+  if (use_gemm_ex_) {
if (*offset_pointers == nullptr) {
std::vector<DataType*> offsets(encoder_heads_ * max_batch_size_ * 5);
for (int i = 0; i < encoder_heads_ * max_batch_size_; i++) {
@@ -1816,6 +1816,27 @@ void EncoderBlock<DataType>::Eval(int N, DataType* in_out_tensor,
64 /*LDC*/,
// 64 * 64 /*strideC*/,
N * encoder_heads_);
+  } else {
+    for (int i = 0; i < encoder_heads_; i++) {
+      int offset = i * depth;
+      // layout of the output: encoder_heads_ * Batch * 64 * 64
+      int outOffset = i * N * 64 * 64;
+      cublasXGemmStridedBatched<DataType>(
+          cublas, CUBLAS_OP_T, CUBLAS_OP_N, 64 /*M*/, 64 /*N*/,
+          depth /*K*/,  // A/B, and M/N are swapped for row-major to
+                        // col-major transform
+          factor,  // to handle "/ tf.math.sqrt(dk)"
+          mha_k + offset /*A*/,
+          d_model /*LDA*/,  // (d_model = depth * encoder_heads_) to skip
+                            // over other "depth" slices / heads
+          64 * d_model, /*strideA*/
+          mha_q + offset /*B*/,
+          d_model /*LDB*/,  // to skip over other "depth" slices / heads
+          64 * d_model, /*strideB*/
+          0.0f,
+          buffer1 + outOffset /*C*/,  // output (matmul_qk) goes to buffer1
+          64 /*LDC*/, 64 * 64 /*strideC*/, N, false);
+    }
+  }

// attention_weights = tf.nn.softmax(scaled_attention_logits, axis = -1)
@@ -1829,7 +1850,7 @@ void EncoderBlock<DataType>::Eval(int N, DataType* in_out_tensor,
(const DataType*)nullptr, stream);
}

-  {
+  if (use_gemm_ex_) {
cublasXGemmBatched<DataType>(
cublas, CUBLAS_OP_N, CUBLAS_OP_N, depth /*M*/, 64 /*N*/, 64 /*K*/, 1.0f,
*offset_pointers + encoder_heads_ * max_batch_size_ *
Expand All @@ -1846,6 +1867,20 @@ void EncoderBlock<DataType>::Eval(int N, DataType* in_out_tensor,
d_model /*LDC*/,
// 64 * d_model /*strideC*/,
N * encoder_heads_);
+  } else {
+    for (int i = 0; i < encoder_heads_; i++) {
+      int offset = i * depth;  // for output and "v" matrix
+      // layout: encoder_heads_ * Batch * 64 * 64
+      int weightsOffset = i * N * 64 * 64;
+      cublasXGemmStridedBatched<DataType>(
+          cublas, CUBLAS_OP_N, CUBLAS_OP_N, depth /*M*/, 64 /*N*/, 64 /*K*/,
+          1.0f, mha_v + offset /*A*/,  // "v" matrix
+          d_model /*LDA*/,  // to skip over other "depth" slices / heads
+          64 * d_model, /*strideA*/
+          buffer1 + weightsOffset /*B*/, 64 /*LDB*/, 64 * 64, /*strideB*/
+          0.0f, buffer2 + offset /*C*/,  // output goes to buffer2
+          d_model /*LDC*/, 64 * d_model /*strideC*/, N, false);
+    }
+  }

// #final dense layer (mha_dense), buffer2 -> buffer1
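For context, a minimal standalone sketch of the fallback pattern the new else branches use: one strided-batched GEMM per attention head, with a depth-sized column offset selecting that head's slice of the interleaved Q/K buffers. This is an illustration in plain fp32 cuBLAS under assumed names (QKPerHead, num_heads, etc.), not lc0's templated cublasXGemmStridedBatched wrapper; the second product (attention weights times V) follows the same per-head pattern with both operands untransposed.

// Hypothetical sketch, not lc0 code: per-head QK^T via strided-batched
// GEMM, assuming row-major [batch, 64, d_model] Q/K buffers where
// d_model = num_heads * depth.
#include <cublas_v2.h>
#include <cmath>

void QKPerHead(cublasHandle_t cublas, const float* q, const float* k,
               float* out, int batch, int num_heads, int depth) {
  const int d_model = num_heads * depth;
  const float alpha = 1.0f / std::sqrt(static_cast<float>(depth));
  const float beta = 0.0f;
  for (int h = 0; h < num_heads; ++h) {
    // Head h occupies columns [h * depth, (h + 1) * depth) of Q and K, so
    // leading dimension d_model skips the other heads' slices; the output
    // is head-major: num_heads * batch * 64 * 64.
    cublasSgemmStridedBatched(
        cublas, CUBLAS_OP_T, CUBLAS_OP_N, 64 /*M*/, 64 /*N*/, depth /*K*/,
        &alpha, k + h * depth, d_model, 64LL * d_model /*strideA*/,
        q + h * depth, d_model, 64LL * d_model /*strideB*/, &beta,
        out + static_cast<long long>(h) * batch * 64 * 64, 64 /*LDC*/,
        64 * 64 /*strideC*/, batch);
  }
}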
src/neural/cuda/network_cuda.cc (2 additions, 1 deletion)
@@ -305,7 +305,8 @@ class CudaNetwork : public Network {
use_res_block_winograd_fuse_opt_ = options.Get<bool>("res_block_fusing");
}

-    const bool use_gemm_ex = deviceProp.major >= 5;
+    const bool use_gemm_ex = (deviceProp.major > 5) ||
+                             (deviceProp.major == 5 && deviceProp.minor >= 3);

// 0. Check for SE.
has_se_ = false;
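For reference, the gate reads the device properties CUDA reports; compute capability 5.3 is where cuBLAS gains native fp16 arithmetic, which is presumably why cc 5.0/5.2 Maxwell parts now take the fallback path. A minimal sketch of an equivalent standalone check (function name hypothetical):

// Hypothetical standalone version of the check added in this commit.
#include <cuda_runtime.h>

bool HasGemmExSupport(int device_id) {
  cudaDeviceProp prop{};
  cudaGetDeviceProperties(&prop, device_id);
  // Half-precision GemmEx paths want compute capability 5.3 or newer;
  // older devices fall back to the per-head strided-batched GEMMs above.
  return (prop.major > 5) || (prop.major == 5 && prop.minor >= 3);
}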
src/neural/cuda/network_cudnn.cc (2 additions, 1 deletion)
@@ -317,7 +317,8 @@ class CudnnNetwork : public Network {
}
}

-    const bool use_gemm_ex = deviceProp.major >= 5;
+    const bool use_gemm_ex = (deviceProp.major > 5) ||
+                             (deviceProp.major == 5 && deviceProp.minor >= 3);

// 0. Check for SE.
has_se_ = false;
