Skip to content

Commit

Permalink
Merge branch 'master' into cuda_blas_37
Browse files Browse the repository at this point in the history
  • Loading branch information
borg323 committed Apr 13, 2023
2 parents e3d75f0 + f39ad6c commit a6ce7b6
Show file tree
Hide file tree
Showing 53 changed files with 4,593 additions and 2,053 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,27 +9,27 @@ Lc0 is a UCI-compliant chess engine designed to play chess via neural network, s

Lc0 can be acquired either via a git clone or an archive download from GitHub. Be aware that there is a required submodule which isn't included in source archives.

For essentially all purposes, including selfplay game generation and match play, we highly recommend using the latest `release/version` branch (for example `release/0.28`), which is equivalent to using the latest version tag.
For essentially all purposes, including selfplay game generation and match play, we highly recommend using the latest `release/version` branch (for example `release/0.29`), which is equivalent to using the latest version tag.

Versioning follows the Semantic Versioning guidelines, with major, minor and patch sections. The training server enforces game quality using the versions output by the client and engine.


Download using git:

```
git clone -b release/0.28 --recurse-submodules https://github.com/LeelaChessZero/lc0.git
git clone -b release/0.29 --recurse-submodules https://github.com/LeelaChessZero/lc0.git
```

If you have already cloned an old version, fetch, view and check out a new branch:
```
git fetch --all
git branch --all
git checkout -t remotes/origin/release/0.28
git checkout -t remotes/origin/release/0.29
```


If you prefer to download an archive, you need to also download and place the submodule:
* Download the [.zip](https://api.github.com/repos/LeelaChessZero/lc0/zipball/release/0.28) file ([.tar.gz](https://api.github.com/repos/LeelaChessZero/lc0/tarball/release/0.28) archive is also available)
* Download the [.zip](https://api.github.com/repos/LeelaChessZero/lc0/zipball/release/0.29) file ([.tar.gz](https://api.github.com/repos/LeelaChessZero/lc0/tarball/release/0.29) archive is also available)
* Extract
* Download https://github.com/LeelaChessZero/lczero-common/archive/master.zip (also available as [.tar.gz](https://github.com/LeelaChessZero/lczero-common/archive/master.tar.gz))
* Move the second archive into the first archive's `libs/lczero-common/` folder and extract
Expand Down
7 changes: 6 additions & 1 deletion appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -116,10 +116,15 @@ before_build:
- cmd: IF %DX%==true SET BUILD_BLAS=true
- cmd: SET EMBED=false
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %ANDROID%==true SET EMBED=true
- cmd: SET POPCNT=true
- cmd: IF %NAME%==cpu-openblas SET POPCNT=false
- cmd: SET F16C=true
- cmd: IF %NAME%==cpu-openblas SET F16C=false
- cmd: IF %CUDA%==true SET F16C=false
- cmd: SET EXTRA=
- cmd: IF %ANDROID%==false SET EXTRA=-Db_vscrt=md
- cmd: IF %ONNX_DML%==true SET EXTRA=-Db_vscrt=md -Donnx_libdir=C:\cache\%ONNX_NAME%\lib -Donnx_include=C:\cache\%ONNX_NAME%\include
- cmd: IF %ANDROID%==false meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dpopcnt=false -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA%
- cmd: IF %ANDROID%==false meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dpopcnt=%POPCNT% -Df16c=%F16C% -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA%
- cmd: IF %ANDROID%==true meson arm64-v8a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-aarch64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-aarch64\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-aarch64
- cmd: IF %ANDROID%==true meson armeabi-v7a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-armv7a\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-armv7a\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-armv7a -Dispc=false -Dneon=false
build_script:
Expand Down
6 changes: 5 additions & 1 deletion build.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,11 @@ set CXX=cl
set CC_LD=link
set CXX_LD=link

if exist "C:\Program Files (x86)\Microsoft Visual Studio\2019" (
if exist "C:\Program Files\Microsoft Visual Studio\2022" (
where /q cl
if errorlevel 1 call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64
set backend=vs2022
) else if exist "C:\Program Files (x86)\Microsoft Visual Studio\2019" (
where /q cl
if errorlevel 1 call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64
set backend=vs2019
Expand Down
33 changes: 32 additions & 1 deletion changelog.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,35 @@
v0.29.0-rc0 (2022-04-03)
v0.29.0 (2022-12-13)
~~~~~~~
* Updated onednn version to the latest one.

v0.29.0-rc1 (2022-12-09)
~~~~~~~
* New metal backend for apple systems. This is now the default backend for
macos builds.
* New onnx-dml backend to use DirectML under windows, has better net
compatibility than dx12 and is faster than opencl. See the README for use
instructions, a separate download of the DirectML dll is required.
* Full attention policy support in cuda, cudnn, metal, onnx, blas, dnnl, and
eigen backends.
* Partial attention policy support in onednn backend (good enough for T79).
* Now the onnx backends can use fp16 when running with a network file (not with
.onnx model files). This is the default for onnx-cuda and onnx-dml, can be
switched on or off by setting the `fp16` backend option to `true` or
`false` respectively.
* The onednn package comes with a dnnl compiled to allow running on an intel gpu
by adding `gpu=0` to the backend options.
* The default net is now 791556 for most backends except opencl and dx12 that
get 753723 (as they lack attention policy support).
* Support for using pgn book with long lines in training: selfplay can start at
a random point in the book.
* New "simple" time manager.
* Support for double Fischer random chess (dfrc).
* Added TC-dependent output to the backendbench assistant.
* Starting with this version, the check backend compares policy for valid moves
after softmax.
* Some assorted fixes and code cleanups.

v0.29.0-rc0 (2022-04-03)
~~~~~~~
* Initial support for attention policy, only cuda backend and partially in
blas/dnnl/eigen (good enough for T79).
Expand Down
5 changes: 3 additions & 2 deletions dist/README-onnx-dml.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ Lc0 is a UCI-compliant chess engine designed to play chess via
neural network, specifically those of the LeelaChessZero project
(https://lczero.org).

To run this version you will most likely need a very recent DirectML dll.
You can download the currently latest nuget installer package from
To run this version you will most likely need a very recent DirectML dll,
which you can get by running the included `install.cmd` script. Alternatively,
you can download the currently latest nuget installer package from
<https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.10.0>.
If you don't know how to use nuget installer packages, you can change the
extension to .zip and open it as a normal zip file, the dll you need is
Expand Down
31 changes: 31 additions & 0 deletions dist/install-dml.cmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
@echo off
rem Downloads the DirectML nuget package and extracts the x64 DirectML.dll
rem from it into the lc0 folder, for use by the Lc0 onnx-dml backend.

rem tar is needed to unpack the downloaded package; give up early without it.
where /q tar
if errorlevel 1 goto error

rem If lc0.exe is not found from here, retry from the folder this script is in.
rem The path is quoted so folders containing spaces or special characters work.
where /q lc0.exe
if errorlevel 1 cd /d "%~dp0"
where /q lc0.exe
if errorlevel 1 (
  echo This script must run in the lc0 folder.
  pause
  exit /b 1
)

cls
echo Installing the DirectML.dll version required by the Lc0 onnx-dml backend.
curl -# --ssl-no-revoke -o tmp_directml.zip "https://globalcdn.nuget.org/packages/microsoft.ai.directml.1.10.0.nupkg"
if errorlevel 1 goto error

rem Extract under a temporary name first: redirecting straight into
rem DirectML.dll would truncate an existing dll even when extraction fails.
tar -xzOf tmp_directml.zip bin/x64-win/DirectML.dll >tmp_DirectML.dll
if errorlevel 1 goto error
move /y tmp_DirectML.dll DirectML.dll >nul
if errorlevel 1 goto error

del /q tmp_directml.zip
echo Installation successful.
pause
exit /b 0

:error
cls
rem Remove any leftovers from a failed download or extraction.
if exist tmp_directml.zip del /q tmp_directml.zip
if exist tmp_DirectML.dll del /q tmp_DirectML.dll
echo Installation failed - see the README for an alternative approach.
pause
exit /b 1

2 changes: 1 addition & 1 deletion libs/lczero-common
Submodule lczero-common updated 1 files
+56 −0 proto/net.proto
35 changes: 23 additions & 12 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,6 @@ files += [
'src/utils/random.cc',
'src/utils/string.cc',
'src/utils/weights_adapter.cc',
'src/utils/fp16_utils.cc',
'src/version.cc',
]
includes += include_directories('src')
Expand Down Expand Up @@ -267,12 +266,14 @@ if get_option('build_backends')

if get_option('blas')
if get_option('mkl') and mkl_lib.found()
add_project_arguments(['-DUSE_MKL', '-DUSE_BLAS'], language : 'cpp')
mkl_inc = get_option('mkl_include')
if run_command('scripts/checkdir.py', mkl_inc).returncode() == 0
includes += include_directories(mkl_inc)
endif
deps += [ mkl_lib ]
if cc.has_header('mkl.h')
add_project_arguments(['-DUSE_MKL', '-DUSE_BLAS'], language : 'cpp')
deps += [ mkl_lib ]
endif

elif get_option('dnnl') and dnnl_lib.found()
add_project_arguments(['-DUSE_DNNL', '-DUSE_BLAS'], language : 'cpp')
Expand Down Expand Up @@ -377,6 +378,7 @@ if get_option('build_backends')

if get_option('ispc') and ispc.found()
files += iscp_gen.process('src/neural/blas/winograd_transform.ispc')
files += iscp_gen.process('src/neural/blas/layer_norm.ispc')
files += iscp_gen.process('src/neural/shared/activation.ispc')
add_project_arguments('-DUSE_ISPC', language : 'cpp')
endif
Expand Down Expand Up @@ -497,27 +499,32 @@ if get_option('build_backends')
)

# Handling of fp16 cuda code.
nvcc_arch = '-arch=compute_70'
nvcc_sm_list = ['sm_80', 'sm_75', 'sm_86', 'sm_70']
nvcc_sm_list = ['80', '75', '86', '70', '89', '90']
if host_machine.system() != 'windows'
nvcc_arch = '-arch=compute_60'
nvcc_sm_list += ['sm_60']
nvcc_sm_list += ['60']
if ['arm', 'aarch64'].contains(host_machine.cpu_family())
# Add Jetson versions to the list.
message('Jetson support enabled.')
nvcc_arch = '-arch=compute_53'
nvcc_sm_list += ['sm_72', 'sm_62', 'sm_53']
nvcc_sm_list += ['72', '62', '53', '87']
endif
endif
# Ignore the given CC for fp16 when it is not in the supported list.
if cuda_cc == '' or not nvcc_sm_list.contains('sm_' + cuda_cc)
nvcc_extra_args = [nvcc_arch]
nvcc_extra_args = []
nvcc_help = run_command(nvcc, '-h').stdout()
foreach x : nvcc_sm_list
if nvcc_help.contains(x)
nvcc_extra_args += '-code=' + x
if nvcc_help.contains('sm_' + x)
nvcc_extra_args += '-gencode=arch=compute_' + x + ',code=sm_' + x
endif
endforeach
# For forward compatibility.
if nvcc_help.contains('sm_90') # Cuda 12+
nvcc_extra_args += '-gencode=arch=compute_90,code=compute_90'
elif nvcc_help.contains('sm_80') # Cuda 11+
nvcc_extra_args += '-gencode=arch=compute_80,code=compute_80'
elif nvcc_help.contains('sm_75') # Cuda 10+
nvcc_extra_args += '-gencode=arch=compute_75,code=compute_75'
endif
endif
files += custom_target('cuda fp16 code',
input : 'src/neural/cuda/fp16_kernels.cu',
Expand Down Expand Up @@ -650,6 +657,10 @@ if not get_option('popcnt')
add_project_arguments('-DNO_POPCNT', language : 'cpp')
endif

if not get_option('f16c')
add_project_arguments('-DNO_F16C', language : 'cpp')
endif

if not get_option('pext')
add_project_arguments('-DNO_PEXT', language : 'cpp')
endif
Expand Down
5 changes: 5 additions & 0 deletions meson_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,11 @@ option('popcnt',
value: true,
description: 'Use the popcnt instruction')

option('f16c',
type: 'boolean',
value: true,
description: 'Use native fp16 conversion instructions')

option('pext',
type: 'boolean',
value: false,
Expand Down
2 changes: 2 additions & 0 deletions scripts/appveyor_win_package.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@ IF %NAME%==onednn copy "%PKG_FOLDER%\%DNNL_NAME%\THIRD-PARTY-PROGRAMS" dist\DNNL
IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-LICENSE
IF %NAME%==onednn 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\DNNL-THIRD-PARTY-PROGRAMS
IF %ONNX_DML%==true type dist\README-onnx-dml.txt |more /P > dist\README.txt
IF %ONNX_DML%==true type dist\install-dml.cmd |more /P > dist\install.cmd
IF %ONNX_DML%==true copy "%PKG_FOLDER%\%ONNX_NAME%\LICENSE" dist\ONNX-DML-LICENSE
IF %ONNX_DML%==true copy "%PKG_FOLDER%\%ONNX_NAME%\ThirdPartyNotices.txt" dist\ONNX-DML-ThirdPartyNotices.txt
IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%PKG_FOLDER%\%ONNX_NAME%\lib\onnxruntime.dll"
IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\README.txt
IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\install.cmd
IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-DML-LICENSE
IF %ONNX_DML%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip .\dist\ONNX-DML-ThirdPartyNotices.txt
IF %OPENCL%==true type scripts\check_opencl.bat |more /P > dist\check_opencl.bat
Expand Down
6 changes: 4 additions & 2 deletions src/engine.cc
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,11 @@ void EngineController::UpdateFromUciOptions() {
if (!syzygy_tb_->init(tb_paths)) {
CERR << "Failed to load Syzygy tablebases!";
syzygy_tb_ = nullptr;
} else {
tb_paths_ = tb_paths;
}
tb_paths_ = tb_paths;
} else if (tb_paths.empty()) {
syzygy_tb_ = nullptr;
tb_paths_.clear();
}

// Network.
Expand Down
92 changes: 30 additions & 62 deletions src/neural/blas/encoder.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
This file is part of Leela Chess Zero.
Copyright (C) 2018-2019 The LCZero Authors
Copyright (C) 2022-2023 The LCZero Authors
Leela Chess is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand All @@ -18,37 +18,34 @@

#pragma once

#include <Eigen/Core>
#include <cmath>
#include <cstddef>

#include "neural/shared/activation.h"
#include "utils/exception.h"

namespace lczero {

namespace {

template <typename T>
using EigenMatrixMap =
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;

template <typename T>
using ConstEigenMatrixMap =
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>;
#ifdef USE_ISPC
#include "layer_norm_ispc.h"
#endif

} // namespace
namespace lczero {

void LayerNorm2DWithSkipConnection(const size_t batch_size,
const size_t channels, float* data,
const float* skip, const float* gammas,
const float* betas, float epsilon) {
const float alpha, const float* skip,
const float* gammas, const float* betas,
float epsilon) {
for (size_t i = 0; i < batch_size; i++) {
#ifndef USE_ISPC
// Mean taken in dimension C.
float mean = 0;
for (size_t c = 0; c < channels; ++c) {
data[i * channels + c] += skip[i * channels + c];
mean += data[i * channels + c];
if (skip != nullptr) {
for (size_t c = 0; c < channels; ++c) {
data[i * channels + c] += alpha * skip[i * channels + c];
mean += data[i * channels + c];
}
} else {
for (size_t c = 0; c < channels; ++c) {
mean += data[i * channels + c];
}
}
mean /= channels;

Expand All @@ -61,51 +58,22 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size,
var /= channels;

// Norm.
float den = 1.0f / std::sqrt(var + epsilon);
for (size_t c = 0; c < channels; ++c) {
data[i * channels + c] = betas[c] + gammas[c] *
(data[i * channels + c] - mean) /
std::sqrt(var + epsilon);
data[i * channels + c] =
betas[c] + gammas[c] * (data[i * channels + c] - mean) * den;
}
}
}

template <bool use_eigen>
void AttentionMatmul2D(const bool transpose_a, const bool transpose_b,
const size_t batch_size, const size_t M, const size_t N,
const size_t K, const float scaling, const float* input1,
const float* input2, float* output) {
for (auto batch = size_t{0}; batch < batch_size; batch++) {
const float* A = &input1[batch * M * K];
const float* B = &input2[batch * N * K];
float* C = &output[batch * M * N];
if (use_eigen) {
auto C_mat = EigenMatrixMap<float>(C, N, M);

if (transpose_a && transpose_b) {
C_mat.noalias() = scaling *
ConstEigenMatrixMap<float>(B, K, N).transpose() *
ConstEigenMatrixMap<float>(A, M, K).transpose();
} else if (transpose_a) {
C_mat.noalias() = scaling * ConstEigenMatrixMap<float>(B, N, K) *
ConstEigenMatrixMap<float>(A, M, K).transpose();
} else if (transpose_b) {
C_mat.noalias() = scaling *
ConstEigenMatrixMap<float>(B, K, N).transpose() *
ConstEigenMatrixMap<float>(A, K, M);
} else {
C_mat.noalias() = scaling * ConstEigenMatrixMap<float>(B, N, K) *
ConstEigenMatrixMap<float>(A, K, M);
}
} else {
#ifdef USE_BLAS
cblas_sgemm(CblasRowMajor, transpose_a ? CblasTrans : CblasNoTrans,
transpose_b ? CblasTrans : CblasNoTrans, M, N, K, scaling, A,
transpose_a ? M : K, B, transpose_b ? K : N, 0.0f, C, N);
#else
// Should never get here.
throw Exception("Blas backend internal error");
#endif
if (skip != nullptr) {
ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, alpha,
skip + i * channels, gammas, betas,
epsilon);
} else {
ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, 0.0f,
nullptr, gammas, betas, epsilon);
}

#endif
}
}

Expand Down
Loading

0 comments on commit a6ce7b6

Please sign in to comment.