Merge branch 'master' into cuda_blas_37

LeelaChessZero · Apr 13, 2023 · a6ce7b6 · a6ce7b6
2 parents e3d75f0 + f39ad6c
commit a6ce7b6
Show file tree

Hide file tree

Showing 53 changed files with 4,593 additions and 2,053 deletions.
diff --git a/README.md b/README.md
@@ -9,27 +9,27 @@ Lc0 is a UCI-compliant chess engine designed to play chess via neural network, s
 
 Lc0 can be acquired either via a git clone or an archive download from GitHub. Be aware that there is a required submodule which isn't included in source archives.
 
-For essentially all purposes, including selfplay game generation and match play, we highly recommend using the latest `release/version` branch (for example `release/0.28`), which is equivalent to using the latest version tag.
+For essentially all purposes, including selfplay game generation and match play, we highly recommend using the latest `release/version` branch (for example `release/0.29`), which is equivalent to using the latest version tag.
 
 Versioning follows the Semantic Versioning guidelines, with major, minor and patch sections. The training server enforces game quality using the versions output by the client and engine.
 
 
 Download using git:
 
 ```
-git clone -b release/0.28 --recurse-submodules https://github.com/LeelaChessZero/lc0.git
+git clone -b release/0.29 --recurse-submodules https://github.com/LeelaChessZero/lc0.git
 ```
 
 If you have cloned already an old version, fetch, view and checkout a new branch:
 ```
 git fetch --all
 git branch --all
-git checkout -t remotes/origin/release/0.28
+git checkout -t remotes/origin/release/0.29
 ```
 
 
 If you prefer to download an archive, you need to also download and place the submodule:
- * Download the [.zip](https://api.github.com/repos/LeelaChessZero/lc0/zipball/release/0.28) file ([.tar.gz](https://api.github.com/repos/LeelaChessZero/lc0/tarball/release/0.28) archive is also available)
+ * Download the [.zip](https://api.github.com/repos/LeelaChessZero/lc0/zipball/release/0.29) file ([.tar.gz](https://api.github.com/repos/LeelaChessZero/lc0/tarball/release/0.29) archive is also available)
  * Extract
  * Download https://github.com/LeelaChessZero/lczero-common/archive/master.zip (also available as [.tar.gz](https://github.com/LeelaChessZero/lczero-common/archive/master.tar.gz))
  * Move the second archive into the first archive's `libs/lczero-common/` folder and extract

diff --git a/appveyor.yml b/appveyor.yml
@@ -116,10 +116,15 @@ before_build:
 - cmd: IF %DX%==true SET BUILD_BLAS=true
 - cmd: SET EMBED=false
 - cmd: IF %APPVEYOR_REPO_TAG%==true IF %ANDROID%==true SET EMBED=true
+- cmd: SET POPCNT=true
+- cmd: IF %NAME%==cpu-openblas SET POPCNT=false
+- cmd: SET F16C=true
+- cmd: IF %NAME%==cpu-openblas SET F16C=false
+- cmd: IF %CUDA%==true SET F16C=false
 - cmd: SET EXTRA=
 - cmd: IF %ANDROID%==false SET EXTRA=-Db_vscrt=md
 - cmd: IF %ONNX_DML%==true SET EXTRA=-Db_vscrt=md -Donnx_libdir=C:\cache\%ONNX_NAME%\lib -Donnx_include=C:\cache\%ONNX_NAME%\include
-- cmd: IF %ANDROID%==false meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dpopcnt=false -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA%
+- cmd: IF %ANDROID%==false meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dpopcnt=%POPCNT% -Df16c=%F16C% -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA%
 - cmd: IF %ANDROID%==true meson arm64-v8a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-aarch64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-aarch64\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-aarch64
 - cmd: IF %ANDROID%==true meson armeabi-v7a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-armv7a\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-armv7a\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-armv7a -Dispc=false -Dneon=false
 build_script:

diff --git a/build.cmd b/build.cmd
@@ -30,7 +30,11 @@ set CXX=cl
 set CC_LD=link
 set CXX_LD=link
 
-if exist "C:\Program Files (x86)\Microsoft Visual Studio\2019" (
+if exist "C:\Program Files\Microsoft Visual Studio\2022" (
+  where /q cl
+  if errorlevel 1 call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64
+  set backend=vs2022
+) else if exist "C:\Program Files (x86)\Microsoft Visual Studio\2019" (
   where /q cl
   if errorlevel 1 call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64
   set backend=vs2019

diff --git a/changelog.txt b/changelog.txt
@@ -1,4 +1,35 @@
-v0.29.0-rc0 (2022-04-03)
+v0.29.0 (2022-12-13)
+~~~~~~~
+* Updated onednn version to the latest one.
+
+v0.29.0-rc1 (2022-12-09)
+~~~~~~~
+* New metal backend for apple systems. This is now the default backend for
+  macos builds.
+* New onnx-dml backend to use DirectML under windows, has better net
+  compatibility than dx12 and is faster than opencl. See the README for use
+  instructions, a separate download of the DirectML dll is required.
+* Full attention policy support in cuda, cudnn, metal, onnx, blas, dnnl, and
+  eigen backends.
+* Partial attention policy support in onednn backend (good enough for T79).
+* Now the onnx backends can use fp16 when running with a network file (not with
+  .onnx model files). This is the default for onnx-cuda and onnx-dml, can be
+  switched on or off by setting the `fp16` backend option to `true` or
+  `false` respectively.
+* The onednn package comes with a dnnl compiled to allow running on an intel gpu
+  by adding `gpu=0` to the backend options.
+* The default net is now 791556 for most backends except opencl and dx12 that
+  get 753723 (as they lack attention policy support).
+* Support for using pgn book with long lines in training: selfplay can start at
+  a random point in the book.
+* New "simple" time manager.
+* Support for double Fischer random chess (dfrc).
+* Added TC-dependent output to the backendbench assistant.
+* Starting with this version, the check backend compares policy for valid moves
+  after softmax.
+* Some assorted fixes and code cleanups.
+
+v0.29.0-rc0 (2022-04-03)
 ~~~~~~~
 * Initial support for attention policy, only cuda backend and partially in
   blas/dnnl/eigen (good enough for T79).

diff --git a/dist/README-onnx-dml.txt b/dist/README-onnx-dml.txt
@@ -4,8 +4,9 @@ Lc0 is a UCI-compliant chess engine designed to play chess via
 neural network, specifically those of the LeelaChessZero project
 (https://lczero.org).
 
-To run this version you will most likely need a very recent DirectML dll.
-You can download the currently latest nuget installer package from
+To run this version you will most likely need a very recent DirectML dll,
+which you can get by running the included `install.cmd` script. Alternatively,
+you can download the currently latest nuget installer package from
 <https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.10.0>.
 If you don't know how to use nuget installer packages, you can change the
 extension to .zip and open it as a normal zip file, the dll you need is

diff --git a/dist/install-dml.cmd b/dist/install-dml.cmd
@@ -0,0 +1,37 @@
+@echo off
+rem Installs the DirectML.dll needed by the Lc0 onnx-dml backend: downloads the
+rem DirectML nuget package with curl and extracts the x64 dll from it with tar.
+where /q tar
+if errorlevel 1 goto error
+
+rem If lc0.exe cannot be found, switch to the folder containing this script and retry.
+where /q lc0.exe
+if errorlevel 1 cd /d %~dp0
+where /q lc0.exe
+if errorlevel 1 (
+  echo This script must run in the lc0 folder.
+  pause
+  exit /b
+)
+
+cls
+echo Installing the DirectML.dll version required by the Lc0 onnx-dml backend.
+curl -# --ssl-no-revoke -o tmp_directml.zip "https://globalcdn.nuget.org/packages/microsoft.ai.directml.1.10.0.nupkg"
+if errorlevel 1 goto error
+
+rem Write the dll from the downloaded archive to stdout and redirect it to a file.
+tar -xzOf tmp_directml.zip bin/x64-win/DirectML.dll >DirectML.dll
+if errorlevel 1 goto error
+
+del /q tmp_directml.zip
+echo Installation successful.
+pause
+exit /b
+
+:error
+rem Do not leave a partial download behind.
+if exist tmp_directml.zip del /q tmp_directml.zip
+cls
+echo Installation failed - see the README for an alternative approach.
+pause
+
diff --git a/libs/lczero-common b/libs/lczero-common
diff --git a/meson.build b/meson.build
@@ -209,7 +209,6 @@ files += [
   'src/utils/random.cc',
   'src/utils/string.cc',
   'src/utils/weights_adapter.cc',
-  'src/utils/fp16_utils.cc',
   'src/version.cc',
 ]
 includes += include_directories('src')
@@ -267,12 +266,14 @@ if get_option('build_backends')
 
   if get_option('blas')
     if get_option('mkl') and mkl_lib.found()
-      add_project_arguments(['-DUSE_MKL', '-DUSE_BLAS'], language : 'cpp')
       mkl_inc = get_option('mkl_include')
       if run_command('scripts/checkdir.py', mkl_inc).returncode() == 0
         includes += include_directories(mkl_inc)
       endif
-      deps += [ mkl_lib ]
+      if cc.has_header('mkl.h')
+        add_project_arguments(['-DUSE_MKL', '-DUSE_BLAS'], language : 'cpp')
+        deps += [ mkl_lib ]
+      endif
 
     elif get_option('dnnl') and dnnl_lib.found()
       add_project_arguments(['-DUSE_DNNL', '-DUSE_BLAS'], language : 'cpp')
@@ -377,6 +378,7 @@ if get_option('build_backends')
 
     if get_option('ispc') and ispc.found()
       files += iscp_gen.process('src/neural/blas/winograd_transform.ispc')
+      files += iscp_gen.process('src/neural/blas/layer_norm.ispc')
       files += iscp_gen.process('src/neural/shared/activation.ispc')
       add_project_arguments('-DUSE_ISPC', language : 'cpp')
     endif
@@ -497,27 +499,32 @@ if get_option('build_backends')
     )
 
     # Handling of fp16 cuda code.
-    nvcc_arch = '-arch=compute_70'
-    nvcc_sm_list = ['sm_80', 'sm_75', 'sm_86', 'sm_70']
+    nvcc_sm_list = ['80', '75', '86', '70', '89', '90']
     if host_machine.system() != 'windows'
-      nvcc_arch = '-arch=compute_60'
-      nvcc_sm_list += ['sm_60']
+      nvcc_sm_list += ['60']
       if ['arm', 'aarch64'].contains(host_machine.cpu_family())
         # Add Jetson versions to the list.
         message('Jetson support enabled.')
-        nvcc_arch = '-arch=compute_53'
-        nvcc_sm_list += ['sm_72', 'sm_62', 'sm_53']
+        nvcc_sm_list += ['72', '62', '53', '87']
       endif
     endif
     # Ignore the given CC for fp16 when it is not in the supported list.
-    if cuda_cc == '' or not nvcc_sm_list.contains('sm_' + cuda_cc)
+    if cuda_cc == '' or not nvcc_sm_list.contains(cuda_cc)
-      nvcc_extra_args = [nvcc_arch]
+      nvcc_extra_args = []
       nvcc_help = run_command(nvcc, '-h').stdout()
       foreach x : nvcc_sm_list
-        if nvcc_help.contains(x)
-          nvcc_extra_args += '-code=' + x
+        if nvcc_help.contains('sm_' + x)
+          nvcc_extra_args += '-gencode=arch=compute_' + x + ',code=sm_' + x
         endif
       endforeach
+      # For forward compatibility.
+      if nvcc_help.contains('sm_90') # Cuda 12+
+        nvcc_extra_args += '-gencode=arch=compute_90,code=compute_90'
+      elif nvcc_help.contains('sm_80') # Cuda 11+
+        nvcc_extra_args += '-gencode=arch=compute_80,code=compute_80'
+      elif nvcc_help.contains('sm_75') # Cuda 10+
+        nvcc_extra_args += '-gencode=arch=compute_75,code=compute_75'
+      endif
     endif
     files += custom_target('cuda fp16 code',
       input : 'src/neural/cuda/fp16_kernels.cu',
@@ -650,6 +657,10 @@ if not get_option('popcnt')
   add_project_arguments('-DNO_POPCNT', language : 'cpp')
 endif
 
+if not get_option('f16c')
+  add_project_arguments('-DNO_F16C', language : 'cpp')
+endif
+
 if not get_option('pext')
   add_project_arguments('-DNO_PEXT', language : 'cpp')
 endif

diff --git a/meson_options.txt b/meson_options.txt
@@ -133,6 +133,11 @@ option('popcnt',
        value: true,
        description: 'Use the popcnt instruction')
 
+option('f16c',
+       type: 'boolean',
+       value: true,
+       description: 'Use native fp16 conversion instructions')
+
 option('pext',
        type: 'boolean',
        value: false,

diff --git a/scripts/appveyor_win_package.cmd b/scripts/appveyor_win_package.cmd
@@ -27,10 +27,12 @@ IF %NAME%==onednn copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL
 IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE
 IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-THIRD-PARTY-PROGRAMS
 IF %ONNX_DML%==true type dist\README-onnx-dml.txt |more /P > dist\README.txt
+IF %ONNX_DML%==true type dist\install-dml.cmd |more /P > dist\install.cmd
 IF %ONNX_DML%==true copy "%PKG_FOLDER%\%ONNX_NAME%\LICENSE" dist\ONNX-DML-LICENSE
 IF %ONNX_DML%==true copy "%PKG_FOLDER%\%ONNX_NAME%\ThirdPartyNotices.txt" dist\ONNX-DML-ThirdPartyNotices.txt
 IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%PKG_FOLDER%\%ONNX_NAME%\lib\onnxruntime.dll"
 IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\README.txt
+IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\install.cmd
 IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-DML-LICENSE
 IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-DML-ThirdPartyNotices.txt
 IF %OPENCL%==true type scripts\check_opencl.bat |more /P > dist\check_opencl.bat

diff --git a/src/engine.cc b/src/engine.cc
@@ -132,9 +132,11 @@ void EngineController::UpdateFromUciOptions() {
     if (!syzygy_tb_->init(tb_paths)) {
       CERR << "Failed to load Syzygy tablebases!";
       syzygy_tb_ = nullptr;
-    } else {
-      tb_paths_ = tb_paths;
     }
+    tb_paths_ = tb_paths;
+  } else if (tb_paths.empty()) {
+    syzygy_tb_ = nullptr;
+    tb_paths_.clear();
   }
 
   // Network.

diff --git a/src/neural/blas/encoder.h b/src/neural/blas/encoder.h
@@ -1,6 +1,6 @@
 /*
  This file is part of Leela Chess Zero.
- Copyright (C) 2018-2019 The LCZero Authors
+ Copyright (C) 2022-2023 The LCZero Authors
 
  Leela Chess is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
@@ -18,37 +18,34 @@
 
 #pragma once
 
-#include <Eigen/Core>
 #include <cmath>
-#include <cstddef>
 
 #include "neural/shared/activation.h"
-#include "utils/exception.h"
 
-namespace lczero {
-
-namespace {
-
-template <typename T>
-using EigenMatrixMap =
-    Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;
-
-template <typename T>
-using ConstEigenMatrixMap =
-    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;
+#ifdef USE_ISPC
+#include "layer_norm_ispc.h"
+#endif
 
-}  // namespace
+namespace lczero {
 
 void LayerNorm2DWithSkipConnection(const size_t batch_size,
                                    const size_t channels, float* data,
-                                   const float* skip, const float* gammas,
-                                   const float* betas, float epsilon) {
+                                   const float alpha, const float* skip,
+                                   const float* gammas, const float* betas,
+                                   float epsilon) {
   for (size_t i = 0; i < batch_size; i++) {
+#ifndef USE_ISPC
     // Mean taken in dimension C.
     float mean = 0;
-    for (size_t c = 0; c < channels; ++c) {
-      data[i * channels + c] += skip[i * channels + c];
-      mean += data[i * channels + c];
+    if (skip != nullptr) {
+      for (size_t c = 0; c < channels; ++c) {
+        data[i * channels + c] += alpha * skip[i * channels + c];
+        mean += data[i * channels + c];
+      }
+    } else {
+      for (size_t c = 0; c < channels; ++c) {
+        mean += data[i * channels + c];
+      }
     }
     mean /= channels;
 
@@ -61,51 +58,22 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size,
     var /= channels;
 
     // Norm.
+    float den = 1.0f / std::sqrt(var + epsilon);
     for (size_t c = 0; c < channels; ++c) {
-      data[i * channels + c] = betas[c] + gammas[c] *
-                                              (data[i * channels + c] - mean) /
-                                              std::sqrt(var + epsilon);
+      data[i * channels + c] =
+          betas[c] + gammas[c] * (data[i * channels + c] - mean) * den;
     }
-  }
-}
-
-template <bool use_eigen>
-void AttentionMatmul2D(const bool transpose_a, const bool transpose_b,
-                       const size_t batch_size, const size_t M, const size_t N,
-                       const size_t K, const float scaling, const float* input1,
-                       const float* input2, float* output) {
-  for (auto batch = size_t{0}; batch < batch_size; batch++) {
-    const float* A = &input1[batch * M * K];
-    const float* B = &input2[batch * N * K];
-    float* C = &output[batch * M * N];
-    if (use_eigen) {
-      auto C_mat = EigenMatrixMap<float>(C, N, M);
-
-      if (transpose_a && transpose_b) {
-        C_mat.noalias() = scaling *
-                          ConstEigenMatrixMap<float>(B, K, N).transpose() *
-                          ConstEigenMatrixMap<float>(A, M, K).transpose();
-      } else if (transpose_a) {
-        C_mat.noalias() = scaling * ConstEigenMatrixMap<float>(B, N, K) *
-                          ConstEigenMatrixMap<float>(A, M, K).transpose();
-      } else if (transpose_b) {
-        C_mat.noalias() = scaling *
-                          ConstEigenMatrixMap<float>(B, K, N).transpose() *
-                          ConstEigenMatrixMap<float>(A, K, M);
-      } else {
-        C_mat.noalias() = scaling * ConstEigenMatrixMap<float>(B, N, K) *
-                          ConstEigenMatrixMap<float>(A, K, M);
-      }
-    } else {
-#ifdef USE_BLAS
-      cblas_sgemm(CblasRowMajor, transpose_a ? CblasTrans : CblasNoTrans,
-                  transpose_b ? CblasTrans : CblasNoTrans, M, N, K, scaling, A,
-                  transpose_a ? M : K, B, transpose_b ? K : N, 0.0f, C, N);
 #else
-      // Should never get here.
-      throw Exception("Blas backend internal error");
-#endif
+    if (skip != nullptr) {
+      ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, alpha,
+                                          skip + i * channels, gammas, betas,
+                                          epsilon);
+    } else {
+      ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, 0.0f,
+                                          nullptr, gammas, betas, epsilon);
     }
+
+#endif
   }
 }