diff --git a/.ci/daint.cscs.ch/ocl.build.sh b/.ci/daint.cscs.ch/ocl.build.sh index f1f0fa4c105..01040f5932e 100755 --- a/.ci/daint.cscs.ch/ocl.build.sh +++ b/.ci/daint.cscs.ch/ocl.build.sh @@ -35,7 +35,7 @@ if [ ! -d "${HOME}/libxsmm" ]; then fi cd "${HOME}/libxsmm" git fetch -git checkout d009b33e8742a93c9e1549323587fb6197451294 +git checkout 488aa88f2a9825e9f92a0cfc773c1aedf019f88a make -j cd .. diff --git a/.github/workflows/testing-linux.yml b/.github/workflows/testing-linux.yml index d9ef4fa8fac..ab5f5d1b5e2 100644 --- a/.github/workflows/testing-linux.yml +++ b/.github/workflows/testing-linux.yml @@ -74,16 +74,16 @@ jobs: mv build/coverage.info build/coverage-Linux-${{ matrix.use_mpi }}-${{ matrix.use_openmp }}-${{ matrix.use_smm }}-cpu.info - name: Upload coverage data - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - name: coverage-data + name: coverage-data-${{ matrix.use_mpi }}-${{ matrix.use_openmp }}-${{ matrix.use_smm }}-${{ matrix.mpi_suffix }} path: build/coverage-*.info - name: Upload coverage data (generated files) - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 if: matrix.use_mpi == 'MPI=ON' && matrix.use_openmp == 'OPENMP=ON' && matrix.use_smm == 'SMM=blas' && matrix.mpi_suffix == 'openmpi' with: - name: coverage-data + name: coverage-data-${{ matrix.use_mpi }}-${{ matrix.use_openmp }}-${{ matrix.use_smm }}-${{ matrix.mpi_suffix }}-generated-files path: | build/src/dbcsr.h build/src/tensors/dbcsr_tensor.h @@ -200,9 +200,10 @@ jobs: - uses: actions/checkout@v4 - name: Download coverage data - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4.1.7 with: - name: coverage-data + pattern: coverage-data-* + merge-multiple: true - name: Combine coverage run: | @@ -213,7 +214,7 @@ jobs: lcov --summary merged.info - name: Upload merged HTML report - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: html-report path: htmlcov diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c0c79251adc..6b71c63041a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ fail_fast: false minimum_pre_commit_version: 3.2.0 repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: 'v0.5.4' + rev: 'v0.8.2' hooks: - id: ruff args: [ --fix, --exit-non-zero-on-fix ] @@ -15,19 +15,19 @@ repos: .cp2k/.*| )$ - repo: https://github.com/psf/black - rev: 24.4.2 + rev: 24.10.0 hooks: - id: black name: Reformat Python files with the black code formatter files: '^.*(/PACKAGE)|(\.py)$' - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: check-ast - id: check-yaml - id: check-symlinks - id: trailing-whitespace -- repo: https://github.com/pseewald/fprettify +- repo: https://github.com/fortran-lang/fprettify rev: v0.3.7 hooks: - id: fprettify @@ -65,4 +65,5 @@ repos: language: python files: \.(c|cc|cxx|cpp|cl|frag|glsl|h|hpp|hxx|ih|ispc|ipp|java|js|m|mm|proto|textproto|vert)$ args: ['-i', '-fallback-style=none', '--style=file'] - additional_dependencies: ['clang-format'] + # specify version since clang-format is not stable version-to-version + additional_dependencies: ['clang-format~=19.1.0'] diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b3f9f569d0..e819bbc23a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -120,13 +120,7 @@ set_property(CACHE WITH_GPU PROPERTY STRINGS ${SUPPORTED_CUDA_ARCHITECTURES} option(WITH_CUDA_PROFILING "Enable profiling within CUDA" OFF) option(WITH_HIP_PROFILING "Enable profiling within HIP" OFF) -option(WITH_G2G "Enable GPU aware MPI within CUDA/HIP backends" OFF) -if (WITH_G2G AND ((NOT USE_ACCEL) OR ((NOT USE_ACCEL MATCHES "cuda") - AND (NOT USE_ACCEL MATCHES "hip")))) - message( - FATAL_ERROR "GPU aware MPI can only be enabled for HIP/CUDA GPU backends") -endif () # ================================================================================================= # LANGUAGES AND TESTING enable_language(Fortran) @@ -274,7 +268,6 @@ if (USE_ACCEL MATCHES "cuda") message(STATUS "Kernel parameters: " ${WITH_GPU_PARAMS}) message(STATUS "GPU architecture number: " ${ACC_ARCH_NUMBER}) message(STATUS "GPU profiling enabled: " ${WITH_CUDA_PROFILING}) - message(STATUS "GPU aware MPI enabled: " ${WITH_G2G}) endif () if (USE_ACCEL MATCHES "hip") @@ -319,7 +312,6 @@ if (USE_ACCEL MATCHES "hip") message(STATUS "Kernel parameters: " ${WITH_GPU_PARAMS}) message(STATUS "GPU architecture number: " ${ACC_ARCH_NUMBER}) message(STATUS "GPU profiling enabled: " ${WITH_HIP_PROFILING}) - message(STATUS "GPU aware MPI enabled: " ${WITH_G2G}) # =================================== BLAS on GPU backend find_package(hipblas CONFIG REQUIRED HINTS ${ROCM_PATH}) diff --git a/VERSION b/VERSION index 41222dc7221..71bea4b7ee9 100644 --- a/VERSION +++ b/VERSION @@ -1,8 +1,8 @@ MAJOR = 2 -MINOR = 7 +MINOR = 8 PATCH = 0 # A specific DATE (YYYY-MM-DD) fixes an official release, otherwise # it is considered Development version. -DATE = 2024-07-29 +DATE = 2024-12-11 diff --git a/cmake/CompilerConfiguration.cmake b/cmake/CompilerConfiguration.cmake index b2b68dc8d8e..5d418744386 100644 --- a/cmake/CompilerConfiguration.cmake +++ b/cmake/CompilerConfiguration.cmake @@ -51,9 +51,6 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") if ((NOT (USE_MPI)) OR (NOT ("${MPI_Fortran_LIBRARY_VERSION_STRING}" MATCHES "Open MPI"))) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=leak") endif () - if (USE_ACCEL MATCHES "hip" AND hip_VERSION GREATER_EQUAL 6.0.0) # Remove deprecated function error with ROCm v6+ - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations") - endif () elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") set(CMAKE_CXX_FLAGS_RELEASE "-O3 -funroll-loops") set(CMAKE_CXX_FLAGS_COVERAGE "-O0 -g --coverage") diff --git a/docs/guide/2-user-guide/1-installation/index.md b/docs/guide/2-user-guide/1-installation/index.md index 1c091558026..a3f1d362052 100644 --- a/docs/guide/2-user-guide/1-installation/index.md +++ b/docs/guide/2-user-guide/1-installation/index.md @@ -70,7 +70,6 @@ make -DUSE_ACCEL= -DWITH_CUDA_PROFILING= -DWITH_HIP_PROFILING= --DWITH_G2G= -DWITH_C_API= -DWITH_EXAMPLES= -DWITH_GPU= diff --git a/docs/guide/3-developer-guide/3-programming/1-overview/index.md b/docs/guide/3-developer-guide/3-programming/1-overview/index.md index 27f6bda40d0..d55b9b3f30f 100644 --- a/docs/guide/3-developer-guide/3-programming/1-overview/index.md +++ b/docs/guide/3-developer-guide/3-programming/1-overview/index.md @@ -55,4 +55,3 @@ Assumed square matrix with 20x20 matrix with 5x5 blocks and a 2x2 processor grid | `__CUDA_PROFILING` | To turn on Nvidia Tools Extensions. It requires to link `-lnvToolsExt` | Fortran, C, C++ | | `__CUDA` | Enable CUDA acceleration | C, C++ | | `__HIP` | Enable HIP acceleration | C, C++ | -| `__DBCSR_ACC_G2G` | Enable GPU Aware MPI in CUDA and HIP backends | Fortran, C, C++ | diff --git a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/2-parameters.md b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/2-parameters.md index 44729500ba9..9e54d016be7 100644 --- a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/2-parameters.md +++ b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/2-parameters.md @@ -14,9 +14,3 @@ The batched matrix-matrix multiplication kernels are templated on: The batched transpose kernels are templated on: * the characteristic dimensions of the transpose: `m, n` - -## Predictive parameters - -The input features for the predictive models can be 'raw' parameters (left-most-column in the figure below), or hand-engineered features 'derived' from the raw features (matrix sizes, launch parameters and resource usage estimations). - -![libsmm_acc_predictive_modeling_features](../../../../../media/images/libsmm_acc_predictive_modeling_features.png) diff --git a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/4-predict.md b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/4-predict.md deleted file mode 100644 index ce0ff183dd0..00000000000 --- a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/4-predict.md +++ /dev/null @@ -1,3 +0,0 @@ -title: Predictive Modeling Framework - -{!./src/acc/libsmm_acc/predict/README.md!} diff --git a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/5-notebooks.md b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/5-notebooks.md deleted file mode 100644 index f450b61b1ad..00000000000 --- a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/5-notebooks.md +++ /dev/null @@ -1,3 +0,0 @@ -title: Notebooks - -{!./src/acc/libsmm_acc/notebooks/README.md!} diff --git a/docs/media/images/README.md b/docs/media/images/README.md deleted file mode 100644 index cf42b988ac8..00000000000 --- a/docs/media/images/README.md +++ /dev/null @@ -1,3 +0,0 @@ -#### libsmm_acc predictive_modelling_features - -The XML file `libsmm_acc_predictive_modelling_features.xml` can be opened in [www.draw.io](www.draw.io) to be edited. diff --git a/docs/media/images/libsmm_acc_predictive_modeling_features.png b/docs/media/images/libsmm_acc_predictive_modeling_features.png deleted file mode 100644 index 69df06ea0eb..00000000000 Binary files a/docs/media/images/libsmm_acc_predictive_modeling_features.png and /dev/null differ diff --git a/docs/media/images/libsmm_acc_predictive_modeling_features.xml b/docs/media/images/libsmm_acc_predictive_modeling_features.xml deleted file mode 100644 index 1a3b13b6908..00000000000 --- a/docs/media/images/libsmm_acc_predictive_modeling_features.xml +++ /dev/null @@ -1 +0,0 @@ -7Z1tc5s6FoB/TWZ270w7IN4/Nmncnbnt3c62u529XzzYEIcNBi+GJumvvxIgDOiA7QRhBx/nQxIZY0DPOdJ50dGVdrN++pS4m/svseeHV0Txnq60j1eEEMew6C/W8ly0qCZxipZVEnhl267hW/DLLxuVsjULPH/bODCN4zANNs3GZRxF/jJttLlJEj82D7uLw+a3btyVLzR8W7qh2Poj8NL7otUm1q79H36wuuffrJrl/S3c5cMqibOo/L4ros3yV/H22uXnKm90e+968WOtSbu90m6SOE6Lv9ZPN37IHi5/bMXnZh3vVted+FF6yAe+3mbZD33z9O1/f/56sG82vyJdf0eIUZznpxtm5RP54qZJ8MSumHbW9oqY9M5ob2nX5X2kz/zZbR+DdehG9L/ruyAMb+IwTvJ32JPQbzTavkpcL6BXWHvvLn+xz8RROnPXQcjYuYmzJPAT+gV/+I/lmyUtRCn/r51EyV+0PXQXfvjV9bwgWtG3WNM2cjff469xkD8YlbWUFzprXmV1Jds0iR986Prjn35yF+b9xm6xegr15152BT0y9Z9qTWU/fPLjtZ8mz/SQ8l1dJcVHSqFxSkQedwASxS7a7uvw8QPdEvpVdepdx9M/yr6HOfhP9ueN+vn3L7b9y7ve+N+jn873d0TVBA7Wv0W/PdDuD+m3Xy9oz5irFKKA3jRrv0/XYfm4E5+i4y7yA1iHbFhX5JdsXF8ZH2mLm6Xxtuhd9gE3DFYR/XtJnyllQLtmzzKgQvqhfGMdeB473fV24y5pT3+PN+W590NUXqkCQHUfJ8Ev2ubyK89puq4kmxMRxVHty/nJBJjIcgnB5JkL0zDZc2Hn9D0OJVcI7GSr0N1uy7+X8TpYlgeJtO2R44MhVLkGLiG0jfeGgKGtiBTqA0BoC7QxGuZLxG2yuDVpUzVTgM2RBJsFw7ZA2C4ENlC1yaLNhGlzkbYLoU0bj7WOOxAR7Ju613Bb3geh99l9jjN2kduUdh//r0DR/8qfn9pq+uI+lQ9dQIGeJ+GwNA/4xr4BhuD2w2x26wAWBCfoqCl9+aEB5u6q3tItFqBbiAF0uKpZ8rpcHOO4vmGSmtvP/BmZ/8/iQv1QK8o0qRVVa1Lu3KXfOKypCeqH7nDijey73hXa6AM9gD7qp/onCrX34CcR8yIUl0fvt7jCDp0Ig1pHStQ/fTiBXBzHWHWug1WKeTBl+yAy7AEYqqjf0cK09IZp5PMbk9pegM4xqmdQaA9fwshik4VmmiI3nuHbng4gQt9zLE+xrNaQc8y4dzA/1sH88HeJ9d7OaTF1QzVVPgKUSku3FUBrVRbh0MMUd4HVeEvvE9/1tnNxVrTxk/kijJdvw/WAcL4ATtVu0sj1UZ1FAim/AVhURdfDT+RqElxVPlM+EbdEv6omScWpjoDVI2I1TaxUaMovDSxxZp8GoT+PcHicKG+q3uSNAGpMlkNBFV0HOW1rZGuabJER2RJDzW64ipGsaZBltciCLExTEloiWQ+I1TSwcppYqcBgqEuiSkyJiM6Mqj2gHJVcc9FUgdpKFldE4Oocp1dnEQqs+Oomso88mWHCV0NnjcocAF0Q5X7W7RnCh0Pl692qhgY6+SHHqsRYtOhubQUm5QccP339N3vkSbxhDOYprVJDj2CIsSOWOCPsR2Swaj+Yl8NjjJwPW5xKwTHHIeLWGpBg6rIsY9GPlfgrerYiQjRfhBc7lTfzl8jGncF+YP20yH/G0E/2wbw9cYze29ruxYEtaTQJpKyg/NMhokCaLtCYU6e4IR0S34NYZlHA9FUOERI5BSLtXiJVootAgvpxCCBFF8ajm2z6iaRPO8pCNwnSZyRyEkRa/UTagIqEfGqDEAnk6K/9NepIJLLm5dV1aNiWxqQYnqLDNp3Qz7bn6EJB3o7nTdV6gdMUEDhFlgsYcJvkVgsbnJG7CXHXr+c040AtNwh0Ynpul6lcGcmFHw9xnAqOpIWj0UqZVMBlB1AC7yDjrpjdVswFEbipALdn3NVNe8xxl5+jARz90GqTIW/T4E3v5c1QId5kpSZpIm7lAgX6ycIPg8mWk8Rwj8NFG2/W13FH4sg7YvwMXjXoOIqie2KcjdGcZlG+ikxuhO249Xqzm1t9JiYRlB9qB974wQczd3TgTa28yPsib1V65qtC/yJCEb2tNx32b3Wab3qK50PqynFM5jUF1ZJnWJrjifrwZarHORgD/q7RTkISoJBVfgUqUsCWYs/fuuN2Clw49nvV3L0M0qDEBIJS0pKG4Mop8d2cldZCTE6MSVN7qIB5Jg0MuH4TBcOLM9bfyMZJ2TBbbJhisaUqkXqceSxg0FMC6KOnk1eiZFtWpZEof0uD6PnvLyw5aN5+nM3g7thVBdlTNaTHMqonidbtGLHaoFgzcBMn9HIiipBbHFbOPZvt/4pTNw3iiFMbPH10t/flf7tyk//M0jCIOg20m5tbI38OR2WAC9IlwvmCMod6A0PLFGc4mm6KGBqc31ctNwIyzpJszgibr92nOb01Nwz9cP4YJ286xayrMuZrrPyZeqsVELWEjLe/1JJ/Sap1pTsO13/cd1RVGBDBg/JgtQH0nwokTlTcBdG8ciwhcRMmzjah+Aw0HxuEOb1b1y2yu7dv1iFwwvS/BZxKQOKgiOAwxAGpOCVxLDQ4r5XxQe6mzB2vh16nDvJbDUEdfBMWYFuUKEaL8GGbs8jCh1OK4SCae9EkqgqpRCiAIxFOMZmHg0ifUVSteeHA5ollLWJZnkXtWMR22tga4EgO+WskYku6derbzvlBAoGVow0ANQXM9YEClBIB7LZoojeeBYQEAtVj9JYW7IBw5MEbWCZYQhgvlxkdk5dveuUVkrh3NNYMMXZCZPkO//vu+ofyPb4zt8/Pm8/Jz89Z9u1dlRhQo/D3OL0vOPyDGdu0hxLaXUjiOCSeOOQjCoKA9+Gy0S0IDTHQFUAMZE1JO8RAnBG0xGAVxgsUAhSC4YSgFcHU9VHd+h1yIE5KmnKAIoAiIE8EwMgWWKhDogiI1RJ2IrChXTD/HYUAhUCaEBjjBts6LhlaDC+mVG3XbhhKzak6LQBNoWhkZnWlb8nKmdqXXSbCeHROlWVZDRLBnCpFFzlUHUcSiJoixt9ybZyThwYq6uSDFPDhy2h4PbGmn0aDNPKogWhNEWN9bUFAExXFYFgxaO0+aYnrQ8aNeGtKh7tyJwUoACgA0gRAVQEJgOoUSJSADk9lIQFon6IMyJYBA5ABWUGrjnsQ1w5C1una94JsLdM8FXt7IPPvuLVEEmw/hyfpNGw/qJeNAdbTwN1si6oujF1vnkVJTHXdnbtM42SuorZDbbdXU7zU8jv1jJfYYkQGEAKCQjCSEIxF3sltLWKLgZBovn5g9G0x333yAEK2DrjBh0QCxShENH+IkMCpEqjuPK1nQ2EV8Whb3IV5gTEInIkOLAyVoW2JUwCoSqFE9DtyIeroY9QBwZc6D3HAcsSjLlnSFHEm0hYElAGUAVkyQLSDS3JLlAHR61qXAQw9oBRIlgILlAJt3NgDUIEHiD2EbrLyP17T27shmCA3kQQ5VXH2Vx3TefHEOo8mn0C8qi6KA1Z/ylGbf3VR9Z7zYsHji9g3w67gDEDauukqoxNEDe29iaFmGSdkjQA5vjvWlsjapFjT9FPqNQKk0VasNUOqOJ5OC7xdwSZeRhHawh7OcRmEPHIoeTi8Tpw8lUALCiSiB9a7gdDD0Xbq6JkjowdWuSnQC+N4M1+76ToLEbtpY0dHv9OXS3R67AxWtX0ZR8ssYU8D67ZfApM6xKQ2bik6p8ceYduP59U6i7LuyOO0eCRNGm1jzIG5g8ae0Rorb08ZxrZTBhyuR00+Ik6P1YIbD0wSQ2K0B2mNbwd2utVmTo/jJpp/vJ4HDBMEcUogtvUhgQrGSss8gAtTKWL0N8+/KUjENOTxWTy7HJyXlaeyjxEFXQWTcMat0aaI0em2KGBaMgqCXEEwQEGQ5dLqKFbYUR5rJwgoAygD8mQAzs2X5bTokIGOyliFDGBaMkqBZCkw4Oz8ca1UAwhudFXsvCL0S5RXlUepknYxORmQ51MkJzvt0hVcC9eYNPjEubEx9yAJo4a4VDDJiqpUFDRMM+jXx9btB/NW6ROzEd0llSo53G/XKhHFN3hobMYtK1XZBIMWFXmYZXAp1PFtvcYIWagmuBajog53gb8o9ih8osaT5QpQgX0/6+ydR3BMBmHHQ9SanZbE7OGpD+TGNLJp+rRl7DQkmiM6pVR4K8+KRMwZuBwVqEEqcNy1uTZgA9doTNdsWEYKJ03hGSwRt/uH5zRCDKeOoQ3mT42MYf/YjNmlF8SjBswKpXlkOu4B0IqAg5pts1b4p8d1VR9VcLvTf9ywDk7r/ZblyhaZPdpVrbdynzVFBdSlDvluqiIyrzJbCFBAvth7u8NhjYuBz1o9Hk3gIbNGTZrvUAPtlB78MJNp0vhxg2EM+OBbMI8l0p2ndFAKsQrzZZGqEmCcHncVpwkuC+lTnojqRaJqQgvqxoYVLKPVA+sSYb1EWIkCzUBHhtU4aFutmhHeZ6wXfw9QBhMzzY7LNEvdhB9h6FejpJ7ZTmvzMDHzTHPAfaOHsKYIOCFgyRfrOX0ay4emikVFes6K9AVbhjTQM8XloiNXdDDExKAIl4hOHjwVHMHHteINcbb5iY6Bmy2iN2n0dP3k6FlA+vchyw9q00ScH05kJYKuNqukG6Y4H9T5lLExH+RbDkrgU9zApRYML5aKLTEt47w1ZaVkXl70Szv1VkLEAmOPTRITJHFyJLZ2jhBV4tgcgq7JJodrylOeHYQzyEnTqOri0pmR04Os3vU0BY7fkcJJUwia0CNnTWqiIRO6WbS8L56BS7/GT0R12LON0wf2M3375NZkP2/BPjE1p8Ed4K4mkPIbxF1titO/aBE+THyAFVDaq9qu8x9RnPz8NbJq045Xba0EM8hDI6tmFp9H1hB7dJPNtlyJNfF1WJeOmgWhpsqyLSwx+BblsJ0WMhlMdWLzitWkhxAnYZXp8ZA59n5HiiyPM7Csb7uee5SeyRe7vHRlpnIToa7LZBmmgI/4MtwfF0+ZAS3XUwfaXpv+m8RxWnvvU+Ju7r/Ens+O+As= \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3f64deea382..31b89858369 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -284,18 +284,6 @@ if (USE_ACCEL) $<$:roctx64> $<$:roctracer64> $<$:OpenCL::OpenCL>) - - if (WITH_G2G) - target_compile_definitions( - dbcsr - PRIVATE __DBCSR_ACC_G2G - $<$:__CUDA> - $<$:ARCH_NUMBER=${ACC_ARCH_NUMBER}> - $<$:__HIP> - $<$:ARCH_NUMBER=${ACC_ARCH_NUMBER}> - $<$:__CUDA_PROFILING> - $<$:__HIP_PROFILING>) - endif () endif () # ================================================================================================= diff --git a/src/acc/acc_bench_smm.c b/src/acc/acc_bench_smm.c index 79bf0625f03..26bd167ca41 100644 --- a/src/acc/acc_bench_smm.c +++ b/src/acc/acc_bench_smm.c @@ -222,21 +222,25 @@ int main(int argc, char* argv[]) { #endif CHECK(libsmm_acc_init(), &result, check); /* note: libsmm_acc_init() may imply acc_init() */ if (EXIT_SUCCESS == result) { - const char* const env_device = getenv("DEVICE"); - const int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device)); int ndevices = 0; result = c_dbcsr_acc_get_ndevices(&ndevices); - if (0 < ndevices && (0 == device || EXIT_SUCCESS == c_dbcsr_acc_set_active_device(device))) { - printf("Activated device%i (ndevices=%i)\n", device, ndevices); - } - else { - if (0 >= ndevices) { - fprintf(stderr, "ERROR: No ACC-device found!\n"); + if (EXIT_SUCCESS == result && 0 < ndevices) { + const char* const env_device = getenv("DEVICE"); + const char* const env_rank = (NULL != getenv("PMI_RANK") ? getenv("PMI_RANK") : getenv("OMPI_COMM_WORLD_LOCAL_RANK")); + const int rank = (NULL != env_rank ? atoi(env_rank) : -1); + int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device)); + device = ((0 <= device && device < ndevices) ? (0 <= rank ? (rank % ndevices) : device) : -1); + result = c_dbcsr_acc_set_active_device(device); + if (EXIT_SUCCESS == result) { + printf("Activated device%i (ndevices=%i)\n", device, ndevices); } else { - fprintf(stderr, "ERROR: Failed to activate device %i of %i!\n", device, ndevices); + fprintf(stderr, "ERROR: Failed to activate device!\n"); } - result = EXIT_FAILURE; + } + else { + fprintf(stderr, "ERROR: No ACC-device found!\n"); + if (EXIT_SUCCESS == result) result = EXIT_FAILURE; } if (EXIT_SUCCESS == result) { rnd = (int*)malloc(sizeof(int) * NRAND); @@ -280,7 +284,7 @@ int main(int argc, char* argv[]) { #if defined(USE_LIBXSMM) libxsmm_timer_tickint start; int print_offset = 0; - char print_buffer[1024]; + char print_buffer[1024] = ""; # if defined(__OPENCL) const char* const env_smm_repeat = getenv("SMM_NREPEAT"); const int smm_nrepeat = (NULL == env_smm_repeat ? 1 : MAX(atoi(env_smm_repeat), 1)); @@ -497,7 +501,7 @@ int main(int argc, char* argv[]) { if (maxdiff < epsilon && NULL != file) maxdiff = epsilon; if (0 < epsilon) { if (LIBXSMM_NOTNAN(diff.v_tst)) { - PRINTF(" (|%g-%g|=%g)\n", diff.v_ref, diff.v_tst, fabs(diff.v_ref - diff.v_tst)); + PRINTF(" (|%g-%g|=%g)\n", diff.v_ref, diff.v_tst, diff.linf_abs); } else { PRINTF(" (%g)\n", diff.v_tst); @@ -508,6 +512,7 @@ int main(int argc, char* argv[]) { } if (0 < check && check < epsilon) result = EXIT_FAILURE; } + else fprintf(stderr, "ERROR: failed to validate!\n"); } # endif } diff --git a/src/acc/acc_bench_trans.c b/src/acc/acc_bench_trans.c index 07101a187df..2f9485b839c 100644 --- a/src/acc/acc_bench_trans.c +++ b/src/acc/acc_bench_trans.c @@ -106,52 +106,48 @@ int main(int argc, char* argv[]) { #else const int warmup = 0; #endif - const char* const env_device = getenv("DEVICE"); - const int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device)); int *stack_hst = NULL, *stack_dev = NULL; ELEM_TYPE *mat_hst = NULL, *mat_dev = NULL; - int result = EXIT_SUCCESS, ndevices = 0, r, i, mm = m, nn = n; + int result = EXIT_SUCCESS, mm = m, nn = n, r, i; void* stream = NULL; #if defined(USE_LIBXSMM) libxsmm_timer_tickint start; double duration; #endif assert(m <= (mn / n) && 0 == (mn % n)); + if (MAX_KERNEL_DIM < m || MAX_KERNEL_DIM < n) { + fprintf(stderr, "Matrix shape exceeds MAX_KERNEL_DIM!\n"); + result = EXIT_FAILURE; + } CHECK(c_dbcsr_acc_init(), &result); /* note: libsmm_acc_init() may imply acc_init() */ CHECK(libsmm_acc_init(), &result); if (EXIT_SUCCESS == result) { + int ndevices = 0; result = c_dbcsr_acc_get_ndevices(&ndevices); - if (0 < ndevices && (0 == device || EXIT_SUCCESS == c_dbcsr_acc_set_active_device(device))) { - printf("Activated device%i (ndevices=%i)\n", device, ndevices); - } - else { - if (0 >= ndevices) { - fprintf(stderr, "No ACC-device found!\n"); + if (EXIT_SUCCESS == result && 0 < ndevices) { + const char* const env_device = getenv("DEVICE"); + const char* const env_rank = (NULL != getenv("PMI_RANK") ? getenv("PMI_RANK") : getenv("OMPI_COMM_WORLD_LOCAL_RANK")); + const int rank = (NULL != env_rank ? atoi(env_rank) : -1); + int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device)); + device = ((0 <= device && device < ndevices) ? (0 <= rank ? (rank % ndevices) : device) : -1); + result = c_dbcsr_acc_set_active_device(device); + if (EXIT_SUCCESS == result) { + printf("Activated device%i (ndevices=%i)\n", device, ndevices); + printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n); + printf("typename (id=%i): %s\n", DBCSR_TYPE(ELEM_TYPE), DBCSR_STRINGIFY(ELEM_TYPE)); } else { - fprintf(stderr, "Failed to activate device %i of %i!\n", device, ndevices); + fprintf(stderr, "ERROR: Failed to activate device!\n"); } -#if !defined(__CUDA) - CHECK(libsmm_acc_finalize(), NULL); -#endif - CHECK(c_dbcsr_acc_finalize(), NULL); - return result; + } + else { + fprintf(stderr, "ERROR: No ACC-device found!\n"); + if (EXIT_SUCCESS == result) result = EXIT_FAILURE; } } else { fprintf(stderr, "ACC initialization failed!\n"); -#if !defined(__CUDA) - CHECK(libsmm_acc_finalize(), NULL); -#endif - CHECK(c_dbcsr_acc_finalize(), NULL); - return result; - } - printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n); - printf("typename (id=%i): %s\n", DBCSR_TYPE(ELEM_TYPE), DBCSR_STRINGIFY(ELEM_TYPE)); - if (MAX_KERNEL_DIM < m || MAX_KERNEL_DIM < n) { - fprintf(stderr, "Matrix shape exceeds MAX_KERNEL_DIM!\n"); - result = EXIT_FAILURE; } #if defined(PRIORITY) CHECK(c_dbcsr_acc_stream_priority_range(&priomin, &priomax), &result); @@ -259,7 +255,7 @@ int main(int argc, char* argv[]) { CHECK(c_dbcsr_acc_finalize(), NULL); if (EXIT_SUCCESS != result) { if (-1 != result) { - fprintf(stderr, "FAILED\n"); + fprintf(stderr, "\nFAILED\n\n"); } else { fprintf(stderr, "Kernel not suitable!\n"); diff --git a/src/acc/acc_libsmm.h b/src/acc/acc_libsmm.h index 06957d74074..93b1623f8bb 100644 --- a/src/acc/acc_libsmm.h +++ b/src/acc/acc_libsmm.h @@ -15,11 +15,11 @@ #define DBCSR_TYPE_double dbcsr_type_real_8 #define DBCSR_TYPE_float dbcsr_type_real_4 -#define LIBSMM_ACC_TRANSPOSE_ROUTINE_NAME_STRPTR ((const char**)&libsmm_acc_transpose_routine_name_ptr) +#define LIBSMM_ACC_TRANSPOSE_ROUTINE_NAME_STRPTR ((const char**)((uintptr_t)&libsmm_acc_transpose_routine_name_ptr)) #define LIBSMM_ACC_TRANSPOSE_ROUTINE_NAME_LENPTR (&libsmm_acc_transpose_routine_name_len) #define LIBSMM_ACC_TRANSPOSE_ROUTINE_NAME_STR (libsmm_acc_transpose_routine_name_str) -#define LIBSMM_ACC_PROCESS_ROUTINE_NAME_STRPTR ((const char**)&libsmm_acc_process_routine_name_ptr) +#define LIBSMM_ACC_PROCESS_ROUTINE_NAME_STRPTR ((const char**)((uintptr_t)&libsmm_acc_process_routine_name_ptr)) #define LIBSMM_ACC_PROCESS_ROUTINE_NAME_LENPTR (&libsmm_acc_process_routine_name_len) #define LIBSMM_ACC_PROCESS_ROUTINE_NAME_STR (libsmm_acc_process_routine_name_str) diff --git a/src/acc/cuda/Makefile b/src/acc/cuda/Makefile index 2aedadeb979..6f6c66b2369 100644 --- a/src/acc/cuda/Makefile +++ b/src/acc/cuda/Makefile @@ -103,23 +103,15 @@ ifneq (,$(ELEM_TYPE)) DFLAGS += -DELEM_TYPE=$(ELEM_TYPE) endif -ifeq (1,$(INTEL)) - CXX := icpc - CC := icc - AR := xiar -else ifneq (0,$(INTEL)) - CXX := icpx - CC := icx - AR := xiar -else ifneq (0,$(GNU)) - override CXX := g++ - override CC := gcc - ifneq (Darwin,$(UNAME)) - override AR := gcc-ar +ifneq (0,$(INTEL)) + ifneq (1,$(INTEL)) + CXX := icpx + CC := icx else - override AR := ar + CXX := icpc + CC := icc endif - #override LD_LIBRARY_DIRS := $(NULL) + AR := $(if $(call which,xiar),xiar,ar) else CXX := g++ CC := gcc diff --git a/src/acc/cuda/dbcsr_cuda_profiling.F b/src/acc/cuda/dbcsr_cuda_profiling.F index ea9bdf46b29..d9364381bb8 100644 --- a/src/acc/cuda/dbcsr_cuda_profiling.F +++ b/src/acc/cuda/dbcsr_cuda_profiling.F @@ -17,7 +17,7 @@ MODULE dbcsr_cuda_profiling int_8 #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads +!$ USE OMP_LIB, ONLY: omp_get_thread_num IMPLICIT NONE diff --git a/src/acc/cuda_hip/acc_init.cpp b/src/acc/cuda_hip/acc_init.cpp index b1e70178c0f..2a4b684ded3 100644 --- a/src/acc/cuda_hip/acc_init.cpp +++ b/src/acc/cuda_hip/acc_init.cpp @@ -26,8 +26,10 @@ extern "C" int c_dbcsr_acc_init() { ACC_DRV(device) acc_device; ACC_API_CALL(GetDevice, (&myDevice)); ACC_DRV_CALL(DeviceGet, (&acc_device, myDevice)); +#if defined(__CUDA) ACC_DRV(context) ctx; ACC_DRV_CALL(DevicePrimaryCtxRetain, (&ctx, acc_device)); +#endif ACC_API_CALL(RuntimeGetVersion, (&runtimeVersion)); // Initialize libsmm_acc, DBCSR's GPU backend @@ -41,6 +43,8 @@ extern "C" int c_dbcsr_acc_finalize() { ACC_DRV(device) acc_device; ACC_API_CALL(GetDevice, (&myDevice)); ACC_DRV_CALL(DeviceGet, (&acc_device, myDevice)); +#if defined(__CUDA) ACC_DRV_CALL(DevicePrimaryCtxRelease, (acc_device)); +#endif return libsmm_acc_finalize(); } diff --git a/src/acc/dbcsr_acc_device.F b/src/acc/dbcsr_acc_device.F index 7b4d29f25c6..d9ec94526e3 100644 --- a/src/acc/dbcsr_acc_device.F +++ b/src/acc/dbcsr_acc_device.F @@ -13,6 +13,8 @@ MODULE dbcsr_acc_device #endif #include "base/dbcsr_base_uses.f90" +!$ USE OMP_LIB, ONLY: omp_get_level + IMPLICIT NONE PUBLIC :: dbcsr_acc_get_ndevices, dbcsr_acc_set_active_device, dbcsr_acc_clear_errors @@ -83,11 +85,16 @@ SUBROUTINE dbcsr_acc_set_active_device(device_id) #if defined (__DBCSR_ACC) INTEGER :: istat -!$OMP PARALLEL DEFAULT(NONE) PRIVATE(istat) SHARED(device_id) - istat = acc_set_active_device_cu(device_id) +!$ IF (0 == omp_get_level()) THEN + istat = 0 +!$OMP PARALLEL DEFAULT(NONE) SHARED(device_id) REDUCTION(MAX:istat) + istat = acc_set_active_device_cu(device_id) +!$OMP END PARALLEL +!$ ELSE + istat = acc_set_active_device_cu(device_id) +!$ END IF IF (istat /= 0) & DBCSR_ABORT("dbcsr_acc_set_active_device: failed") -!$OMP END PARALLEL #else MARK_USED(device_id) diff --git a/src/acc/libsmm_acc/README.md b/src/acc/libsmm_acc/README.md index faa0aab6806..8978689c00b 100644 --- a/src/acc/libsmm_acc/README.md +++ b/src/acc/libsmm_acc/README.md @@ -12,12 +12,10 @@ For a description of the library (some details are outdated, but this neverthele ## Directory Organization -- [`kernels/`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/kernels/): GPU kernels (CUDA- and HIP-compatible) for matrix-matrix multiplication and python interface to autotuning and predictive code. -- [`notebooks/`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/notebooks/): jupyter notebooks for exploring data generated from autotuning and prediction. +- [`kernels/`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/kernels/): GPU kernels (CUDA- and HIP-compatible) for matrix-matrix multiplication and Python interface to autotuning code. - `generate_*.py`: utility scripts for `libsmm_acc` compilation - `libsmm_acc*`: libsmm_acc C++ and CUDA / HIP code -- [`parameters/`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/parameters/): contains `parameters_GPU.json` files. These are sets of matrix-matrix multiplication parameters for different (m, n, k)-triplets optimized for a given GPU card. You can explore these parameters interactively using the [provided jupyter notebook](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/notebooks/inspect_autotuned_parameters.ipynb) -- [`predict/`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/predict/): scripts for prediction of optimal parameter sets, see [predictive modeling of kernel parameters](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/predict/README.md) +- [`parameters/`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/parameters/): contains `parameters_GPU.json` files. These are sets of matrix-matrix multiplication parameters for different (m, n, k)-triplets optimized for a given GPU card. - [`tune/`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/tune/): scripts for autotuning of optimal parameter sets, see [autotuning of kernel parameters](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/tune/README.md) ## Matrix-matrix Multiplication Kernels and Parameters @@ -46,7 +44,7 @@ which take between 3 - 7 **parameters** (see figure at the top): - **w**: input slab width (width of slab `P_A` and `P_B`) - **v**: output slab width (width of slab `P_C`) -The performance of the matrix-matrix multiplication kernels is highly dependent on the choice of algorithm and parameters. For this reason, `libsmm_acc` provides lists of optimal parameters for different GPU cards and different (m, n, k)-triplets. These sets of optimal parameters can be found either through *autotuning* or *predictive modeling*. +The performance of the matrix-matrix multiplication kernels is highly dependent on the choice of algorithm and parameters. For this reason, `libsmm_acc` provides lists of optimal parameters for different GPU cards and different (m, n, k)-triplets. ## Contributing to libsmm_acc @@ -56,19 +54,13 @@ We expect users to contribute to the library by providing new optimized kernels Follow the [autotuning procedure](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/tune/README.md) -#### Predictive modeling of kernel parameters - -Follow the [predictive modeling procedure](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/predict/README.md) - #### Adding a new kernel 1. Choose a kernel `name` 2. Add the kernel's code (must be able to compile by both `nvcc` and `hip`) in file `kernels/smm_acc_dnt_name.h` -3. Add python kernel class inheriting from base class `kernels/smm_acc_dnt_name.py` - -4. Add the new kernel to the `kernel_algorithm` data structure in [`kernels/smm_acc_predict.py`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/kernels/smm_acc_predict.py) +3. Add Python kernel class inheriting from base class `kernels/smm_acc_dnt_name.py` #### Adding support for a new GPU card @@ -85,4 +77,4 @@ Follow the [predictive modeling procedure](https://github.com/cp2k/dbcsr/blob/de } ``` -then add matrix-matrix multiplication parameters for this GPU using *autotuning* and *predictive modeling* +then add matrix-matrix multiplication parameters for this GPU using *autotuning*. diff --git a/src/acc/libsmm_acc/kernels/README.md b/src/acc/libsmm_acc/kernels/README.md index 31e4b81eb61..14a268c3d3d 100644 --- a/src/acc/libsmm_acc/kernels/README.md +++ b/src/acc/libsmm_acc/kernels/README.md @@ -14,8 +14,6 @@ * `smm_acc_dnt_ALGORITHM.h` Batched Multiply Kernel CUDA/HIP code -* [`smm_acc_predict.py`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/kernels/smm_acc_predict.py) Class and helper functions for parameter prediction procedure - * [`smm_acc_transpose.h`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/kernels/smm_acc_transpose.h) Transpose CUDA/HIP code ## Batched Multiplication Kernels diff --git a/src/acc/libsmm_acc/kernels/smm_acc_dnt_medium.h b/src/acc/libsmm_acc/kernels/smm_acc_dnt_medium.h index 7f70b2835d5..01e71d2d562 100644 --- a/src/acc/libsmm_acc/kernels/smm_acc_dnt_medium.h +++ b/src/acc/libsmm_acc/kernels/smm_acc_dnt_medium.h @@ -422,7 +422,7 @@ __global__ void __launch_bounds__(threads, minblocks) smm_acc_dnt_medium(const i } if (need_sync) syncthreads(); - /* Add results from shared memory buffer to global C block. */ + /* Add results from shared memory buffer to global C block. */ #pragma unroll for (int i = tidx; i < mn; i += threads) { atomicAdd(&c_data[srcC + i], buff[i]); diff --git a/src/acc/libsmm_acc/kernels/smm_acc_dnt_small.h b/src/acc/libsmm_acc/kernels/smm_acc_dnt_small.h index 767c02f4025..51f62b24a64 100644 --- a/src/acc/libsmm_acc/kernels/smm_acc_dnt_small.h +++ b/src/acc/libsmm_acc/kernels/smm_acc_dnt_small.h @@ -114,7 +114,7 @@ __global__ void __launch_bounds__(threads, minblocks) smm_acc_dnt_small(const in nrun = grouping; if (((bidx + 1) * grouping) > stack_size) nrun = stack_size % grouping; - /* Set the partial sum (tile T) to zero */ + /* Set the partial sum (tile T) to zero */ #pragma unroll for (int i = 0; i < M * N; i++) myc[i] = 0.0; @@ -203,7 +203,7 @@ __global__ void __launch_bounds__(threads, minblocks) smm_acc_dnt_small(const in if (need_sync) syncthreads(); - /* Add results from shared memory buffer to global C block. */ + /* Add results from shared memory buffer to global C block. */ #pragma unroll for (int i = tidx; i < mn; i += threads) atomicAdd(&c_data[srcC + i], buff[i]); } diff --git a/src/acc/libsmm_acc/notebooks/README.md b/src/acc/libsmm_acc/notebooks/README.md deleted file mode 100644 index df0114cf103..00000000000 --- a/src/acc/libsmm_acc/notebooks/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# libsmm_acc Notebooks - -Notebooks for exploring data generated from auto-tuning and prediction. - -**Requirements** -Python version required: python 3.6+ - -Install all python packages required (if you do not want this project's requirements to interfere with your other Python projects, consider doing so in a [virtual environment](https://docs.python.org/3/tutorial/venv.html)), using - -```bash -pip install -r requirements.txt -``` diff --git a/src/acc/libsmm_acc/notebooks/inspect_autotuned_parameters.ipynb b/src/acc/libsmm_acc/notebooks/inspect_autotuned_parameters.ipynb deleted file mode 100644 index e2e971ede58..00000000000 --- a/src/acc/libsmm_acc/notebooks/inspect_autotuned_parameters.ipynb +++ /dev/null @@ -1,279 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "# `libcusmm`: explore the space of autotuned parameters" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook can be used to explore the space of autotuned parameters, stored in files named `parameters_GPU.json`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Library imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import json, os" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Choose a GPU" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "GPU = 'P100' # Options: K20X, K40, K80, P100, V100, Mi50, Mi100, Mi250" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "params = '../parameters_' + GPU + '.json' \n", - "assert os.path.exists(params)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Read autotuned parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open(params) as f:\n", - " all_parameters = pd.DataFrame([params for params in json.load(f)])\n", - "autotuned_parameters = all_parameters[all_parameters['source'] == 'autotuned']\n", - "print(\"Reading autotuned data from\", params)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "ordered_columns = ['m', 'n', 'k', 'perf', 'algorithm', 'threads', 'grouping', 'minblocks', 'tile_m', 'tile_n', 'v', 'w']\n", - "autotuned_parameters = autotuned_parameters[ordered_columns]\n", - "print('Autotuned parameters:')\n", - "display(autotuned_parameters)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Description" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print('Numer of columns:', len(autotuned_parameters.columns), '\\nNumber of rows:', len(autotuned_parameters.index.values))\n", - "print('\\nColumn names:')\n", - "for c in autotuned_parameters.columns.values: \n", - " print(c)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "autotuned_parameters.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import pandas_profiling \n", - "pandas_profiling.ProfileReport(autotuned_parameters)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Plot performances" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline \n", - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "autotuned_parameters['mnk'] = autotuned_parameters['m'] * autotuned_parameters['n'] * autotuned_parameters['k']\n", - "plt.semilogx(autotuned_parameters['mnk'], autotuned_parameters['perf'], '.', markersize=3)\n", - "plt.xlabel('Training (m, n, k) triplets (in order of increasing m*n*k)')\n", - "plt.ylabel('Performance [Gflops]')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Parameter frequencies" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ignore the 'threads' parameter since it has to be adapted to the size of matrix C\n", - "parameter_set = ['algorithm', 'grouping', 'minblocks', 'tile_m', 'tile_n', 'v', 'w']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Most frequent parameter sets" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def get_par_set(algorithm, grouping, minblocks, tile_m, tile_n, v, w):\n", - " par_set= algorithm + '_' + str(int(grouping)) + '_' + str(int(minblocks)) \n", - " if not np.isnan(tile_m):\n", - " par_set += '_' + str(int(tile_m)) + '_' + str(int(tile_n))\n", - " if not np.isnan(v):\n", - " par_set += '_' + str(int(v)) + '_' + str(int(w))\n", - " return par_set\n", - " \n", - "vget = np.vectorize(get_par_set)\n", - "autotuned_parameters['param_set'] = vget(*[a for a in autotuned_parameters[parameter_set].values.transpose()])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "param_set_freq = autotuned_parameters['param_set'].value_counts(dropna=True)\n", - "autotuned_parameters['param_set_freq'] = autotuned_parameters['param_set'].apply(lambda item: param_set_freq[item])\n", - "autotuned_parameters.sort_values(by='param_set_freq', ascending=False, inplace=True)\n", - "autotuned_parameters.iloc[:50,:]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Most frequent parameters (independently of each other)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "most_frequent_values = dict()\n", - "for c in autotuned_parameters.columns.values: \n", - " plt.figure\n", - " plt.hist(autotuned_parameters[c].dropna(), bins=50)\n", - " plt.title(c)\n", - " plt.show()\n", - " if c in parameter_set: \n", - " col = autotuned_parameters[c].dropna().values\n", - " values, counts = np.unique(col, return_counts=True)\n", - " ind_most_freq = np.argmax(counts)\n", - " most_freq_val = values[ind_most_freq]\n", - " most_frequent_values[c] = most_freq_val" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/src/acc/libsmm_acc/notebooks/inspect_training_data.ipynb b/src/acc/libsmm_acc/notebooks/inspect_training_data.ipynb deleted file mode 100644 index 7778bd35aef..00000000000 --- a/src/acc/libsmm_acc/notebooks/inspect_training_data.ipynb +++ /dev/null @@ -1,607 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "# `libcusmm`: Explore the Training Data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook allows you to explore the training data collected from autotuning before proceeding to training." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import re, sys, os, json, random\n", - "import numpy as np\n", - "import pandas as pd\n", - "import dask.dataframe as dd\n", - "from nb_helper import *" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read training data from autotuning folders \n", - "\n", - "Read from files of form `tune_*x*x*/raw_training_data_*x*x*_algo.csv`. \n", - "If you want to read from aggregated Parquet files (recommended), skip to lower" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Path to autotuning data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Provide the path to the autotuning data:\n", - "- You can use the bash cell below to navigate your filetree:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%bash\n", - "ls -ad AUTOTUNING_DATA_PATH/tune_*x*x*/" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Then, copy what you've replaced `AUTOTUNING_DATA_PATH` with in the Python variable `autotuning_data_path` below:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "autotuning_data_path = '' # may not recognize '~', please provide an absolute path:\n", - "check_autotuning_data_path(autotuning_data_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Set options\n", - "\n", - "Set the following options appropriately:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "to_read = 100 # How many / which data folders to read. Options: \n", - " # - 'all': reads from all available data folders. \n", - " # Beware, this might result in memory errors if large amounts of data are made available\n", - " # - a number: reads this number of data folders (e.g. 100)\n", - " # - a regex: reads the data folders with matching regex (e.g. tune_4x*x*)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "algorithm = get_algorithm_to_explore('all') # algorithms to explore. Options: all, tiny, small, medium" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get the list of folders to read\n", - "folders_to_read = get_folders_to_read(to_read, autotuning_data_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Read training data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_files_to_read, derived_files_to_read = get_files_to_read(folders_to_read, algorithm)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "%%time\n", - "num_f = len(files_to_read)\n", - "data_raw = dd.read_csv(raw_files_to_read, dtype={}).set_index(\"Unnamed: 0\")\n", - "data_derived = dd.read_csv(derived_files_to_read, dtype={}).set_index(\"Unnamed: 0\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# merge the two: " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read training data from Parquet files\n", - "\n", - "Read from files of form `training_data_algorithm.parquet`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Path to autotuning data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Provide the path to the autotuning data:\n", - "- You can use the bash cell below to navigate your filetree:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%bash\n", - "ls -ad AUTOTUNING_DATA_PATH/*.parquet" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Then, copy what you've replaced `AUTOTUNING_DATA_PATH` with in the Python variable `training_data_path` below:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "training_data_path = '../tune_dataset_V100/' # may not recognize '~', please provide an absolute path:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "algorithm = \"small\" # algorithm to explore. Options: tiny, small, medium, largeDB1, largeDB2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "parquet_data_file = os.path.join(training_data_path, \"training_data_\" + algorithm + \".parquet\")\n", - "data = dd.read_parquet(parquet_data_file)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data inspection\n", - "\n", - "### Data head" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "for i in range(0, len(data.columns.values), page_width):\n", - " display(data.iloc[:,i:i+page_width].head())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data description" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print('Data size :', sys.getsizeof(data)/10**6, 'MB')\n", - "print('Number of columns:', len(data.columns.values))\n", - "print('Number of rows : {:,}'.format(len(data.index)))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "#for i in range(0, len(data.columns.values), page_width):\n", - "# display(data.iloc[:,i:i+page_width].describe())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "#print('Number of columns:', len(data.columns), '\\nNumber of rows:', len(data.index), '\\n')\n", - "for col in data.columns: \n", - " print('{:<40} {}'.format(col, data[col].dtype))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Feature categories\n", - "mnk = ['m', 'n', 'k']\n", - "kernel_pars = ['algorithm', 'threads_per_blk', 'grouping', 'minblocks',\n", - " 'tile_m', 'tile_n', 'w', 'v', 'nbytes_smem', 'nbytes_cmem', 'regs_per_thread']\n", - "kernel_pars = list(set(kernel_pars) & set(data.columns.values))\n", - "perf = ['perf (Gflop/s)', 'perf_scaled']\n", - "common = ['Gflops', 'mxnxk', 'size_a', 'size_b', 'size_c', 'nblks', \n", - " 'warps_per_blk', 'nwarps', 'sm_desired', 'nthreads', 'ru_param_stack_unroll_factor']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Features\n", - "\n", - "Features in the left-most column correspond to \"raw\" parameters\n", - "* **green** kernel parameters \n", - "* **grey** GPU card properties (taken from Nvidia/AMD documentation) \n", - "* **pink** autotuning parameters (taken from DBCSR codebase) \n", - "\n", - "Other features correspond to derived parameters, computed from the \"raw\" parameters\n", - "* **yellow** matrix sizes\n", - "* **light grey** launch parameters\n", - "* **blue** and **purple** estimations of resource usages" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![parameters dependency graph](../../../../docs/media/images/libsmm_acc_predictive_modeling_features.png)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "thresh = 300000 # do not perform very long operations on row counts above this threshold" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "data_to_profile = data\n", - "n_rows_data = len(data)\n", - "if n_rows_data > thresh: # if it is a very large dataframe, perform op on subsampled rows\n", - " data_to_profile = data.sample(frac = thresh / n_rows_data)\n", - "\n", - "import pandas_profiling \n", - "pandas_profiling.ProfileReport(data_to_profile.compute())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data visualization" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "import matplotlib\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Get Series from Dask to Pandas\n", - "data_mxnxk = data['mxnxk'].compute()\n", - "data_perf = data['perf (Gflop/s)'].compute()\n", - "data_perf_scaled = data['perf_scaled'].compute()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.semilogx(data_mxnxk, data_perf, '.', markersize=1)\n", - "plt.xlabel('Training (m, n, k) triplets (in order of increasing m*n*k)')\n", - "plt.ylabel('Performance [Gflops]')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data visualization (scaled performance)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.plot(data_mxnxk, data_perf_scaled, '.', markersize=1)\n", - "plt.xlabel('Training (m, n, k) triplets (in order of increasing m*n*k)')\n", - "plt.ylabel('Performance scaled (overall)')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Performance profile" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Choose (m, n, k) triplet\n", - "m_plot, n_plot, k_plot = (4, 4, 4)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_mnk = data[data['m'] == m_plot][ \n", - " data['n'] == n_plot][ \n", - " data['k'] == k_plot].compute()\n", - "data_mnk.sort_values(by='perf (Gflop/s)', ascending=True, inplace=True)\n", - "plt.plot(data_mnk['perf (Gflop/s)'].values)\n", - "plt.xlabel('parameter set')\n", - "plt.ylabel('perf (Gflop/s)')\n", - "plt.title('Performance profile for kernel ' + str(m_plot) + 'x'+ str(n_plot) + 'x'+ str(k_plot))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Histograms with Bokeh\n", - "from bokeh.plotting import figure \n", - "from bokeh.models import ColumnDataSource, HoverTool\n", - "from bokeh.io import output_notebook, show\n", - "output_notebook()\n", - "\n", - "# Create histogram\n", - "num_bins = 100 \n", - "hist, edges = np.histogram(data_mnk['perf (Gflop/s)'], bins=num_bins)\n", - "df_hist = pd.DataFrame({'hist': hist, 'left': edges[:-1], 'right': edges[1:]})\n", - "source = ColumnDataSource(df_hist)\n", - "\n", - "# Create tool \n", - "hover = HoverTool(tooltips=[('# occurences', '@hist'), ('low', '@left'), ('high', '@right')])\n", - "\n", - "# Create the figure\n", - "p = figure(plot_width=800, plot_height=800, title=\"Performance histogram\",\n", - " toolbar_location=None, tools=\"\")\n", - "p.xgrid.grid_line_color = None\n", - "p.xaxis.axis_label = \"Performance (GFlop/s)\"\n", - "p.xaxis.major_label_orientation = 1.2\n", - "p.yaxis.axis_label = \"# occurrences\"\n", - "p.quad(source=source, bottom=0, top='hist', left='left', right='right', fill_color='blue')\n", - "p.add_tools(hover)\n", - "show(p)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Histograms with Bokeh\n", - "from bokeh.plotting import figure \n", - "from bokeh.models import ColumnDataSource, HoverTool\n", - "from bokeh.io import output_notebook, show\n", - "output_notebook()\n", - "\n", - "# Create histogram\n", - "num_bins = 100 \n", - "hist, edges = np.histogram(data_mnk['perf_scaled'], bins=num_bins)\n", - "df_hist = pd.DataFrame({'hist': hist, 'left': edges[:-1], 'right': edges[1:]})\n", - "source = ColumnDataSource(df_hist)\n", - "\n", - "# Create tool \n", - "hover = HoverTool(tooltips=[('# occurences', '@hist'), ('low', '@left'), ('high', '@right')])\n", - "\n", - "# Create the figure\n", - "p = figure(plot_width=800, plot_height=800, title=\"Performance histogram\",\n", - " toolbar_location=None, tools=\"\")\n", - "p.xgrid.grid_line_color = None\n", - "p.xaxis.axis_label = \"Performance scaled\"\n", - "p.xaxis.major_label_orientation = 1.2\n", - "p.yaxis.axis_label = \"# occurrences\"\n", - "p.quad(source=source, bottom=0, top='hist', left='left', right='right', fill_color='blue')\n", - "p.add_tools(hover)\n", - "show(p)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Top slices of perf. distribution\n", - "pars_autotuning_top = {\n", - " 5: list(), \n", - " 2: list(), \n", - " 1: list(), \n", - " 0.5: list()\n", - "}\n", - "max_perf = float(data_mnk['perf (Gflop/s)'].max())\n", - "max_perf_idx = data_mnk['perf (Gflop/s)'].idxmax()\n", - "max_perf_row = data_mnk.loc[max_perf_idx]\n", - "max_perf_cond = max_perf_row[mnk + kernel_pars + ['perf (Gflop/s)']]\n", - "\n", - "print('Maximally performing parameter set:')\n", - "display(max_perf_cond)\n", - "for perc in pars_autotuning_top.keys():\n", - " lim = max_perf - max_perf*perc/100\n", - " blob = data_mnk.loc[data_mnk['perf (Gflop/s)'] >= lim]\n", - " print('\\ntop', perc, '%')\n", - " display(blob[kernel_pars + ['perf (Gflop/s)']].describe())\n", - " pars_autotuning_top[perc].append(blob)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Pair plot " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_pairplot = data\n", - "n_rows_data = len(data)\n", - "if n_rows_data > thresh: # if it is a very large dataframe, perform op on subsampled rows\n", - " data_pairplot = data.sample(frac = thresh / n_rows_data)\n", - "\n", - "sns.pairplot(data_pairplot[mnk + kernel_pars + perf].compute().dropna())" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.5" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/src/acc/libsmm_acc/notebooks/libsmm_acc_predictive_modeling_features.png b/src/acc/libsmm_acc/notebooks/libsmm_acc_predictive_modeling_features.png deleted file mode 120000 index b77db205b4d..00000000000 --- a/src/acc/libsmm_acc/notebooks/libsmm_acc_predictive_modeling_features.png +++ /dev/null @@ -1 +0,0 @@ -../../../../docs/media/images/libsmm_acc_predictive_modeling_features.png \ No newline at end of file diff --git a/src/acc/libsmm_acc/notebooks/nb_helper.py b/src/acc/libsmm_acc/notebooks/nb_helper.py deleted file mode 100644 index 1897b8b0320..00000000000 --- a/src/acc/libsmm_acc/notebooks/nb_helper.py +++ /dev/null @@ -1,111 +0,0 @@ -# -*- coding: utf-8 -*- -#################################################################################################### -# Copyright (C) by the DBCSR developers group - All rights reserved # -# This file is part of the DBCSR library. # -# # -# For information on the license, see the LICENSE file. # -# For further information please visit https://dbcsr.cp2k.org # -# SPDX-License-Identifier: GPL-2.0+ # -#################################################################################################### - -import os -import re - - -# =============================================================================== -# I/O -# kernel_folder_pattern = re.compile('tune_(\d+)x(\d+)x(\d+)$') -kernel_folder_pattern = re.compile(r"tune_(\d+x\d+x\d+)$") -page_width = 5 # columns per output line - - -def check_autotuning_data_path(autotuning_data_path): - # sanity checks - assert os.path.exists(autotuning_data_path), ( - "This path does not exist: " + autotuning_data_path - ) - assert len(os.listdir(autotuning_data_path)) > 0, ( - "No folders found in path: " + autotuning_data_path - ) - # print infos - print( - "Number of tuning data folders found: {}".format( - len(os.listdir(autotuning_data_path)) - ) - ) - - -def get_folders_to_read(to_read, autotuning_data_path): - if to_read == "all": - folders_to_read = [ - os.path.join(autotuning_data_path, f) - for f in os.listdir(autotuning_data_path) - if kernel_folder_pattern.match(f) is not None - ] - elif isinstance(to_read, int): - folders_to_read = [ - os.path.join(autotuning_data_path, f) - for f in os.listdir(autotuning_data_path) - if kernel_folder_pattern.match(f) is not None - ] - folders_to_read = folders_to_read[:to_read] - elif isinstance(to_read, str): - to_read = re.compile(to_read) - folders_to_read = [ - os.path.join(autotuning_data_path, f) - for f in os.listdir(autotuning_data_path) - if to_read.match(f) is not None - ] - else: - raise AssertionError("Cannot recognize option: " + to_read) - - num_folders_to_read = len(folders_to_read) - assert num_folders_to_read > 0 - print("Data folders to be read from (total: {:,})\n".format(num_folders_to_read)) - for f in folders_to_read: - print(f) - - return folders_to_read - - -def get_algorithm_to_explore(algo): - algo_to_read = ( - [algo] if algo != "all" else ["tiny", "small", "medium", "largeDB1", "largeDB2"] - ) - print("Algorithm(s) to explore:") - for a in algo_to_read: - print(a) - - return algo_to_read - - -def get_files_to_read(folders_to_read, algo_to_read): - files_to_read = list() - for i, kernel_folder in enumerate(folders_to_read): - print( - "\nfrom {}, read ({}/{:,})".format( - kernel_folder, i + 1, len(folders_to_read) - ) - ) - - for name_algo in algo_to_read: - mnk_string = kernel_folder_pattern.search(kernel_folder).groups()[0] - raw_file_base = "raw_training_data_" + mnk_string + "_" + name_algo + ".csv" - raw_file = os.path.join(kernel_folder, raw_file_base) - derived_file_base = "training_data_" + mnk_string + "_" + name_algo + ".csv" - derived_file = os.path.join(kernel_folder, derived_file_base) - - if os.path.exists(raw_file) and os.path.exists(derived_file): - # Read raw parameters file - files_to_read.append(raw_file) - - # Read derived parameters file - files_to_read.append(derived_file) - - else: - if not os.path.exists(raw_file): - print("\t...{:50} no file".format(raw_file_base)) - if not os.path.exists(derived_file): - print("\t...{:50} no file".format(derived_file_base)) - - return files_to_read diff --git a/src/acc/libsmm_acc/notebooks/requirements.txt b/src/acc/libsmm_acc/notebooks/requirements.txt deleted file mode 100644 index f36ef7a07a0..00000000000 --- a/src/acc/libsmm_acc/notebooks/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -bokeh==1.0.4 -matplotlib==3.0.2 -numpy==1.22.0 -pandas==0.23.4 -pandas-profiling==1.4.1 -seaborn==0.9.0 diff --git a/src/acc/libsmm_acc/predict/README.md b/src/acc/libsmm_acc/predict/README.md deleted file mode 100644 index ddb967a2679..00000000000 --- a/src/acc/libsmm_acc/predict/README.md +++ /dev/null @@ -1,150 +0,0 @@ -# Training Procedure for Predictive Modeling of Optimal Parameters in `libsmm_acc` - -The performance of the matrix-matrix multiplication kernels is highly dependent on the choice of algorithm and parameters, this is why [*autotuning*](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/README.md) is used to find optimal kernel parameters. - -However, the auto-tuning procedure is expensive, and the space of (m,n,k)-triplets to explore is large. The following predictive modeling procedure is set up to predict optimal parameters for (m,n,k)-triplets that have not been auto-tuned from the data gathered from auto-tuning other (m,n,k)-triplets. - ---- - -### Requirements - -Python version required: `python 3.6+` - -Install all python packages required (if you do not want this project's requirements to interfere with your other Python projects, consider doing so in a [virtual environment](https://docs.python.org/3/tutorial/venv.html)), using - -```bash -pip install -r requirements.txt -``` - ---- - -### Predictive parameters - -The input features for the predictive models can be 'raw' parameters, or hand-engineered features 'derived' from the raw features (matrix sizes, launch parameters and resource usage estimations). - ---- - -### Predictive modeling procedure - -#### 1. Get the data - -Get the data to be used for training, either by downloading data from the [dedicated repository](https://github.com/cp2k/dbcsr-data), or by auto-tuning new kernels yourself and combining them with pre-existing data. - -##### 1.a Download pre-collected data from dedicated repository - -- Download data from the dedicated repository: - - ```bash - wget https://github.com/cp2k/dbcsr-data/blob/master/GPU/raw_training_data_ALGORITHM.csv # for ALGORITHM = tiny, small, medium, largeDB1, largeDB2 - ``` - -- Compute derived parameters from raw parameters and create a record of baseline and maximum performances: run [`prepare_training_data.py`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/predict/prepare_training_data.py), providing the CUDA/HIP architecture number and the location of the downloaded data: - - ```bash - ./prepare_training_data.py # –arch 60 --folder /scratch/autotuning_dataset, e.g. - ``` - -##### 1.b (optional) Aquire data from auto-tuning - -- We would appreciate if you would upload the data resulting from your auto-tuning procedure to the [dedicated repository](https://github.com/cp2k/dbcsr-data). For this, please take note, at this stage, of the [information required to upload your data](https://github.com/cp2k/dbcsr-data/blob/master/git-commit.template). - -- If you're auto-tuning data for a new GPU, make sure that the GPU's compute architecture properties are given in the file [`kernels/gpu_properties.json`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/kernels/gpu_properties.json). If not, please add them. - -- Follow the [instructions for auto-tuning](tune.md). - -- If all went well, you now have directories named `tune_mxnxk` containing log files in which parameter sets and their corresponding measured performances are recorded. - -- Collect the information in all the `tune_mxnxk` directories into CSV files: run [`predict_collect.py`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/predict/predict_collect.py), providing the location of the auto-tuning data: - - ```bash - ./predict_collect.py # --folder /scratch/autotuning_dataset, e.g. - ``` - -You should now have 5 CSV files containing raw data (`raw_training_data_ALGORITHM.csv`, for `ALGORITHM = tiny, small, medium, largeDB1, largeDB2`) - -#### 2. Prepare the data for predictive modeling - -A few steps are needed to make the data ready for training: - -- Record maximum and baseline performances of (m,n,k)-triplets in JSON files -- Compute derived training data and write it to a CSV file -- Compress training data files from CSV to Parquet files - -```bash -./prepare_training_data.py # --folder /scratch/autotuning_dataset -a 60 -j12, e.g. to run with 12 threads -``` - -The data preparation is relatively computationally expensive, especially for large data sets. -A good way of running it, is to - -1. Compute just the maximum and baseline parameters for each algorithm separately (`-l ALGORITHM --skip_derived_data=True`), adjusting the `-j` parameter so it runs fast enough, while not running into "out-of-memory"-errors -2. Run again with `--skip_derived_data=True` to create the files that aggregate maximum and baseline performances for all algorithms. -3. Compute derived data records for each algorithm separately (`-l ALGORITHM`), adjusting the `-j` option. -4. Run the script again without specifying the algorithm nor skipping the derived data to make sure all necessary files have been generated. - -##### At the end, you should end up with the following files: - -- `raw_training_data_ALGORITHM.csv` (containing all *raw* parameters for training a model for algorithm ALGORITHM, obtained in step 1) -- `training_data_ALGORITHM.csv` (containing all *derived* parameters for training a model for algorithm ALGORITHM) -- `training_data_ALGORITHM.parquet` (containing all *raw* and *derived* parameters for training a model for algorithm ALGORITHM in Parquet files, convenient for reading in parallel using Dask) -- `baseline_performances_ALGORITHM.json` and `baseline_performances_by_algo.json` (containing, for each (m, n, k)-triplet in the training data, its baseline performance, i.e. its performance were it to be run with a set of parameters that are an expert's "best guess"). Additionally, the baseline performances are plotted in `baseline_performances.svg`. -- `maximum_performances_ALGORITHM.json`, `max_performances_by_algo.json` and `max_performances.json` (containing, for each (m, n, k)-triplet, its maximum performance). Additionally, the maximum performances are plotted in `maximum_performances.svg`. - -#### 3. (optional) Explore the data - -Explore the data interactively using the [provided Jupyter notebook](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/notebooks/inspect_training_data.ipynb). - -#### 4. Train - -For each algorithm, build a predictive model using decision trees and feature selection based on the features' permutation importance. - -```bash -./predict_train.py # --algo medium --folder /scratch/autotuning_dataset, e.g. -``` - -Use the command-line parameters `--folder` and `--destination_folder` to choose the folder from which data is read, as well as the folder to which models, logs, etc. are written. -Repeat this step for all algorithms. -This may take several hours. For example, training algorithm 'medium' for the P100 took 11 hours on a single Greina (CSCS) node. -Moreover, depending on the size of the training data, large amounts of memory may be needed. For example, training algorithm 'medium' for the P100 was run on a 192 GB node. - -#### 5. Generate optimal parameters - -Given predictive models (in the form of serialized [scikit-learn](https://scikit-learn.org/) model objects) for all unseen (m,n,k)s, generate or update a file of optimal parameters - -```bash -./predict_genpars.py -c 5000 \ # chunk size - -j 12 \ # 12 threads - --largeDB2 /scratch/largeDB2/feature_tree_refit.p \ # path to models - --largeDB1 /scratch/largeDB1/feature_tree_refit.p \ - --medium /scratch/medium/feature_tree_refit.p \ - --small /scratch/small/feature_tree_refit.p \ - --tiny /scratch/tiny/feature_tree_refit.p -``` - -This may take several hours. For example, generating parameters for the P100 took 8 hours on a single Piz Daint (CSCS) node. For this reason, intermediate results are stored in JSON files in a folder `predict_genpars_ckpt`. Once this script has finished running, and you've successfully obtained a new `parameters_GPU.json` file, you may delete the checkpoint folder `predict_genpars_ckpt`. - -#### 6. Evaluate the predicted parameters - -```bash -./predict_evaluate.py -f libsmm_acc_predicted.out -n libsmm_acc_baseline.out -``` - -#### 7. Contribute your new parameters and data - -##### Contribute training data - -See [instructions](https://github.com/cp2k/dbcsr-data#contributing) in our [dedicated repository](https://github.com/cp2k/dbcsr-data) - -##### Contribute predicted parameters - -Submit a pull request updating the `parameters_GPU.json` file in question. - ---- - -### Contributing to the training procedure - -#### Adding a new predictive feature - -- Choose the new feature's name, "`NAME`" -- Add the feature as a method of `class PredictiveParameters`, named `get_NAME` -- Add the derived feature to the data structure `derived_parameters` in [`kernels/smm_acc_predict.py`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/kernels/smm_acc_predict.py) diff --git a/src/acc/libsmm_acc/predict/predict_collect.py b/src/acc/libsmm_acc/predict/predict_collect.py deleted file mode 100755 index ab41ebe1de2..00000000000 --- a/src/acc/libsmm_acc/predict/predict_collect.py +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -#################################################################################################### -# Copyright (C) by the DBCSR developers group - All rights reserved # -# This file is part of the DBCSR library. # -# # -# For information on the license, see the LICENSE file. # -# For further information please visit https://dbcsr.cp2k.org # -# SPDX-License-Identifier: GPL-2.0+ # -#################################################################################################### - -import sys -import os -import re -import glob -import argparse -import pandas as pd - -sys.path.append("../") - -from kernels.smm_acc import to_string, kernel_algorithm, parameter_types # noqa: E402 - - -# =============================================================================== -def main(tunedir): - """ - Once autotuning of new kernels has been run, - - collect the parameter information and performance from log files, - - dump them to CSV files for data analysis and training of a predictive model - """ - # =============================================================================== - # Check for old data files first - for algorithm in kernel_algorithm.keys(): - training_data_file = os.path.join(tunedir, f"raw_training_data_{algorithm}.csv") - if os.path.exists(training_data_file): - print( - f"WARNING: Found old data file {training_data_file}, re(move) it first ... exiting" - ) - sys.exit(1) - - # Find all the 'tune_MxNxK' folders - kernel_folder_pattern = re.compile(r"tune_(\d+)x(\d+)x(\d+)$") - kernel_folders = [ - os.path.join(tunedir, ak) - for ak in os.listdir(tunedir) - if kernel_folder_pattern.match(ak) is not None - ] - n_kernels = len(kernel_folders) - assert n_kernels > 0, ( - "Found no kernel folders of format" - + str(kernel_folder_pattern) - + " in folder " - + tunedir - ) - print(f"Found {n_kernels} kernel folders") - - # Collect information and write to csv - collect_training_data(kernel_folders, kernel_folder_pattern) - - # Print commands to merge CSVs into one big CSV for training data - merge_data_files(tunedir) - - -# =============================================================================== -# Helper variables and functions (formatting & writing) -autotuning_line = re.compile( - r"OK Kernel_dnt_(\w+) m (\d+)\s+n (\d+)\s+k (\d+)\s+" - + r"(?:tile_m (\d+)\s+tile_n (\d+)\s+(?:w (\d+)\s+v (\d+)\s+)?)?" - + r"threads (\d+)\s+grouping (\d+)\s+minblocks (\d+)\s+GFlop/s (\d+(?:\.\d+)?)" -) - - -def read_log_file(log_folder, m, n, k): - """ - Given a folder of kernel autotuning, read and parse the autotuning information in the log file - and return it in the form of a pandas Dataframe. - :param log_folder: folder of kernel autotuning - :return: pandas Dataframe containing autotuning information - """ - # Find log files in the log folder - log_files = [f for f in os.listdir(log_folder) if f[-4:] == ".log"] - assert len(log_files) > 0 - log_files = sorted(log_files) - - # Parse the log files and collect data - data = list() - for log_file in log_files: - print(f"Processing log file {log_file}") - with open(os.path.join(log_folder, log_file), "r") as f: - log_file_content = f.read().splitlines() - - for line in log_file_content: - if "OK" in line: # this line contains autotuning data - # Parse the line - match = autotuning_line.match(line) - assert match is not None, "Found null match: " + line - - # Get algorithm, parameters, and performance - data.append( - { - "m": m, - "n": n, - "k": k, - "algorithm": match.group(1), - "threads": match.group(9), - "grouping": match.group(10), - "minblocks": match.group(11), - "tile_m": ( - match.group(5) if match.group(5) is not None else None - ), - "tile_n": ( - match.group(6) if match.group(6) is not None else None - ), - "w": match.group(7) if match.group(7) is not None else None, - "v": match.group(8) if match.group(8) is not None else None, - "perf (Gflop/s)": match.group(12), - } - ) - - print(f"{len(data)} autotuning lines found") - - # Merge dictionaries into a pandas dataframe - dataframe = pd.DataFrame(data) - for col in dataframe.columns: - dataframe[col] = dataframe[col].astype(parameter_types[col], errors="ignore") - - return dataframe - - -def collect_training_data(kernel_folders, kernel_folder_pattern): - """ - Collect training data from log files resulting of autotuning - """ - - # =============================================================================== - # For each folder: - n_kernels = len(kernel_folders) - for i, kernel_folder in enumerate(kernel_folders): - print(f"\nProcess folder {kernel_folder} ({i+1}/{n_kernels})") - - # Find (m, n, k) - # Each folder contains data for just one (m, n, k) but potentially mutliple algorithms - match = kernel_folder_pattern.search(kernel_folder).groups() - m = int(match[0]) - n = int(match[1]) - k = int(match[2]) - - # =============================================================================== - # Collect info from log files - log_files = [f for f in os.listdir(kernel_folder) if f[-4:] == ".log"] - if len(log_files) > 0: - data = read_log_file(kernel_folder, m, n, k) - else: - print(f"No log files found in folder {kernel_folder} ... skipping") - continue - - # =============================================================================== - # Write parameters to CSV - for name_algo, kernel_algo in kernel_algorithm.items(): - # if applicable to this mnk - if name_algo in data["algorithm"].values: - # Does collected csv file exist already? - raw_parameters_file_name = os.path.join( - kernel_folder, - "raw_training_data_" - + to_string(m, n, k) - + "_" - + name_algo - + ".csv", - ) - - if os.path.exists(raw_parameters_file_name): - print(f"Found csv file {raw_parameters_file_name} ... skipping") - else: - # Get the data corresponding to this algorithm - data_algo = data[data["algorithm"] == name_algo] - # Write raw parameters - pars_to_get = kernel_algo.launch_parameters + ["perf (Gflop/s)"] - data_algo[pars_to_get].to_csv(raw_parameters_file_name, index=False) - print("Wrote", raw_parameters_file_name) - - -# =============================================================================== -def merge_data_files(tunedir): - """ - Merge CSV files - """ - for algorithm in kernel_algorithm.keys(): - training_data_file = os.path.join( - tunedir, "raw_training_data_{algorithm}.csv".format(algorithm=algorithm) - ) - - if os.path.exists(training_data_file): - print(f"\nFound {training_data_file} ... skipping") - os.rename(training_data_file, f"{training_data_file}.bak") - - print(f"\nMerging partial CSV files into {training_data_file} ... ") - - filenames_pattern = os.path.join( - tunedir, - "tune_*/raw_training_data_*_{algorithm}.csv".format(algorithm=algorithm), - ) - print("Merging all files with pattern:", filenames_pattern) - filenames = glob.glob(filenames_pattern) - if len(filenames) == 0: - print("Found no files matching this pattern ... skipping") - - else: - print(f"Found {len(filenames)} files matching this pattern") - - with open(training_data_file, "w") as out: - # Write the first file, including its header - fn_1 = filenames.pop(0) - with open(fn_1) as f: - header_line_ref = next(f) # read header line - out.write(header_line_ref) # write header line - out.write(f.read()) # write the rest of the file - # Write the rest of the files, skipping the header line each time - for i, fn in enumerate(filenames): - print("writing from {} ({}/{})".format(fn, i + 1, len(filenames))) - with open(fn) as f: - header_line = next(f) # skip header line - assert header_line == header_line_ref, ( - 'Cannot merge file "' - + fn - + '", because its header line:\n' - + header_line - + 'is different from the header line of file "' - + fn_1 - + '":\n' - + header_line_ref - ) - out.write(f.read()) - - print("Wrote to {}".format(training_data_file)) - - -# =============================================================================== -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=""" - Collect matrix-matrix multiplication parameters and performances measured during autotuning. For that, - parse the log files created by the autotuning and record parameter sets and their performances to CSV files. - - This script is part of the workflow for predictive modelling of optimal libsmm_acc parameters. - For more details, see README.md. - """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument( - "-f", - "--folder", - metavar="FOLDER", - type=str, - default=".", - help="Folder in which the folders tune_*x*x*x/ are to be found", - ) - parser.add_argument( - "-a", - "--arch", - metavar="ARCHITECTURE_NUMBER", - type=int, - default=80, - help="GPU architecture code. Options: sm_35, sm_37, sm_60, sm_70, sm_80, gfx906", - ) - - args = parser.parse_args() - main(args.folder) diff --git a/src/acc/libsmm_acc/predict/predict_evaluate.py b/src/acc/libsmm_acc/predict/predict_evaluate.py deleted file mode 100755 index a5b3de7f4af..00000000000 --- a/src/acc/libsmm_acc/predict/predict_evaluate.py +++ /dev/null @@ -1,174 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -#################################################################################################### -# Copyright (C) by the DBCSR developers group - All rights reserved # -# This file is part of the DBCSR library. # -# # -# For information on the license, see the LICENSE file. # -# For further information please visit https://dbcsr.cp2k.org # -# SPDX-License-Identifier: GPL-2.0+ # -#################################################################################################### - -import re -import numpy as np -import argparse -from predict_helpers import ( - performance_gain, - relative_performance_gain, - plot_absolute_performance_gain, - plot_relative_performance_gain, - plot_performance_gains, -) - - -# =============================================================================== -def main(file, file_baseline): - """ - Given a file containing the results of the LBSMM_ACC performance test, perform evaluation of the predictive model. - """ - # =============================================================================== - # Read optimal-parameter-prediction result file - with open(file) as f: - result_file = f.read().splitlines() - results_predictive_model = read_result_file(result_file) - - # Read baseline result file - with open(file_baseline) as f: - result_file = f.read().splitlines() - results_baseline = read_result_file(result_file) - - # =============================================================================== - # Performance comparison quantities - improved_over_baseline = dict( - zip( - sorted(results_predictive_model.keys()), - [ - results_predictive_model[(m, n, k)] > results_baseline[(m, n, k)] - for m, n, k in sorted(results_predictive_model.keys()) - ], - ) - ) - perf_gain_over_baseline = performance_gain( - results_baseline, results_predictive_model - ) - rel_perf_gain_over_baseline = relative_performance_gain( - results_baseline, results_predictive_model - ) - - # =============================================================================== - # Print results - header = "m, n, k: baseline perf. [Gflops], predictive model perf. [Gflops], performance gain [? ]" - print(header) - line = ( - "{m:>2}, {n:>2}, {k:>2}: {baseline_perf:>7.2f}, {predictive_model_perf:>7.2f}, " - + "{performance_gain:>7.2f}, {better}" - ) - for m, n, k in sorted(results_predictive_model.keys()): - print( - line.format( - m=m, - n=n, - k=k, - baseline_perf=results_baseline[(m, n, k)], - predictive_model_perf=results_predictive_model[(m, n, k)], - performance_gain=perf_gain_over_baseline[(m, n, k)], - better=improved_over_baseline[(m, n, k)], - ) - ) - - print( - "\nKernel performances improved by predictive model:", - list(improved_over_baseline.values()).count(True), - "/", - len(results_predictive_model.keys()), - ) - perf_gain_improved = [pg for pg in perf_gain_over_baseline.values() if pg > 0] - print( - "Mean performance gain amongst improved kernels: {:.2f} Gflops".format( - np.mean(perf_gain_improved) - ) - ) - - print( - "\nKernel performances reduced by predictive model:", - list(improved_over_baseline.values()).count(False), - "/", - len(results_predictive_model.keys()), - ) - perf_gain_deteriorated = [pg for pg in perf_gain_over_baseline.values() if pg < 0] - print( - "Mean performance loss amongst deteriorated kernels: {:.2f} Gflops".format( - np.mean(perf_gain_deteriorated) - ) - ) - - print( - "\nMean performance gain overall: {:.2f} Gflops".format( - np.mean(list(perf_gain_over_baseline.values())) - ) - ) - - # =============================================================================== - # Plot results (testing set: predictive modelling VS naïve) - plot_absolute_performance_gain( - perf_gain_over_baseline, "non-autotuned", "baseline", "predictive model" - ) - plot_relative_performance_gain( - rel_perf_gain_over_baseline, "non-autotuned", "baseline", "predictive model" - ) - plot_performance_gains( - results_predictive_model, - results_baseline, - "non-autotuned", - "baseline", - "predictive model", - ) - - -# =============================================================================== -def read_result_file(file): - results = dict() - result_line = re.compile(r"OK (\d+) x (\d+) x (\d+) GFlop/s (\d+(?:\.\d+)?)") - for line in file: - match = result_line.match(line) - if match is not None: - m = int(match.group(1)) - n = int(match.group(2)) - k = int(match.group(3)) - perf = float(match.group(4)) - results[(m, n, k)] = perf - - return results - - -# =============================================================================== -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=""" - Given a file containing the results of the LIBSMM_ACC performance test, perform evaluation of the predictive - model. - - This script is part of the workflow for predictive modelling of optimal libsmm_acc parameters. - For more details, see README.md. - """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument( - "-f", - "--file", - metavar="filename.out", - type=str, - default="", - help="Result file to evaluate. Output of tests/libsmm_acc_timer_multiply.cpp", - ) - parser.add_argument( - "-n", - "--file_baseline", - metavar="filename.out", - type=str, - default="", - help="Baseline performance file to compare against.", - ) - - args = parser.parse_args() - main(args.file, args.file_baseline) diff --git a/src/acc/libsmm_acc/predict/predict_genpars.py b/src/acc/libsmm_acc/predict/predict_genpars.py deleted file mode 100755 index 61f377053ce..00000000000 --- a/src/acc/libsmm_acc/predict/predict_genpars.py +++ /dev/null @@ -1,406 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -#################################################################################################### -# Copyright (C) by the DBCSR developers group - All rights reserved # -# This file is part of the DBCSR library. # -# # -# For information on the license, see the LICENSE file. # -# For further information please visit https://dbcsr.cp2k.org # -# SPDX-License-Identifier: GPL-2.0+ # -#################################################################################################### - -import gc -import os -import sys -import json -import pandas as pd -from itertools import product -import argparse -from joblib import Parallel, delayed -from predict_helpers import safe_pickle_load -from warnings import simplefilter - -simplefilter(action="ignore", category=UserWarning) - -sys.path.append("../") -from kernels.smm_acc import to_tuple, to_string # noqa: E402 -from kernels.smm_acc_predict import ( # noqa: E402 - gpu_architectures, - kernel_algorithm, - params_dict_to_kernel, - PredictiveParameters, -) - -# The joblib backend spawns additional processes, which do not inherit the warning filters applied using warnings.filterwarnings -os.environ["PYTHONWARNINGS"] = "ignore::UserWarning" - - -# =============================================================================== -def main(params, njobs, baseline, paths_to_models, chunk_size): - """ - Update parameter file with new optimal parameter predictions given newly trained decision trees - """ - # =============================================================================== - # Load GPU and autotuning properties - assert ( - os.path.basename(params) in gpu_architectures.keys() - ), "Cannot find compute version for file " + str(params) - arch_code = gpu_architectures[os.path.basename(params)] - with open("../kernels/gpu_properties.json") as f: - gpu_properties = json.load(f)[arch_code] - with open("../kernels/autotuning_properties.json") as f: - autotuning_properties = json.load(f) - - # Load autotuned kernel parameters - with open(params) as f: - all_kernels = [params_dict_to_kernel(**params) for params in json.load(f)] - print("libsmm_acc: Found %d existing parameter sets." % len(all_kernels)) - autotuned_mnks = [(k.m, k.n, k.k) for k in all_kernels if k.autotuned] - autotuned_kernels_ = [k for k in all_kernels if k.autotuned] - autotuned_kernels = dict(zip(autotuned_mnks, autotuned_kernels_)) - - # =============================================================================== - # Construct the list of (m,n,k)-triplets for which parameter sets should be made available to libcusmm - mnks = combinations(list(range(4, 46))) - mnks = set.union(set(mnks), set(autotuned_kernels.keys())) - - # =============================================================================== - # Compute parameter sets - mnks_to_predict = list() - kernels_to_print = dict() - for m, n, k in mnks: - if (m, n, k) in autotuned_kernels.keys(): - kernels_to_print[(m, n, k)] = autotuned_kernels[(m, n, k)] - else: - mnks_to_predict.append((m, n, k)) - - if baseline: - kernels = get_baseline_kernels( - mnks_to_predict, gpu_properties, autotuning_properties - ) - else: - kernels = get_optimal_kernels( - mnks_to_predict, - njobs, - chunk_size, - paths_to_models, - gpu_properties, - autotuning_properties, - 1, - ) - - kernels_to_print.update(kernels) - - # =============================================================================== - # Write to file - with open(params, "w") as f: - s = json.dumps( - [ - kernels_to_print[kernel].as_dict_for_parameters_json - for kernel in sorted(kernels_to_print.keys()) - ] - ) - s = s.replace("}, ", "},\n") - s = s.replace("[", "[\n") - s = s.replace("]", "\n]") - f.write(s) - print("Wrote new predicted parameters to file", params) - - -# =============================================================================== -# Helpers -def combinations(sizes): - return list(product(sizes, sizes, sizes)) - - -def remove_empty_entries(ld): - """ - Given a list of dictionaries "ld", remove its list elements that are empty dicts - """ - return [d for d in ld if d] # empty dictionaries evaluate to False - - -def find_optimal_kernel( - mnk, algo, tree, tree_features, gpu_properties, autotuning_properties -): - """ - Find the optimal kernel parameter set for a given (m, n, k) and a given algorithm - :return: optimal_kernels: dictionary, keys: (m, n, k), values: Kernel object describing best parameters - """ - - # Get parameter space for this (m, n, k) and this algorithm - m, n, k = mnk - parameter_space_ = kernel_algorithm[algo].promising_parameters( - m, n, k, gpu_properties, autotuning_properties - ) - parameter_space = pd.DataFrame(parameter_space_) - del parameter_space_ - parameter_space["algorithm"] = [algo] * len( - parameter_space.index - ) # Add "algorithm" column - if len(parameter_space.index) == 0: - optimal_kernels = dict() - - else: - # Get predictor features from raw parameters - parameter_sets = PredictiveParameters( - parameter_space, gpu_properties, autotuning_properties, None - ) - predictors = parameter_sets.get_features(tree_features) - if algo == "medium": - predictors = predictors.rename( - columns=dict( - zip( - predictors.columns, - [ - "f{}".format(i) - for i in range(0, len(predictors.columns) + 1) - ], - ) - ) - ) - - # Predict performances - performances_scaled = tree.predict(predictors) - del predictors - parameter_performances = parameter_sets.params - del parameter_sets - parameter_performances["perf"] = performances_scaled - del performances_scaled - - # Pick optimal kernel - optimal_kernel = max( - parameter_performances.to_dict("records"), key=lambda x: x["perf"] - ) - del parameter_performances - optimal_kernels = dict() - optimal_kernels[(m, n, k)] = params_dict_to_kernel( - **optimal_kernel, source="predicted" - ) - - return optimal_kernels - - -def get_optimal_kernels( - mnks_to_predict, - njobs, - chunk_size, - paths_to_models, - gpu_properties, - autotuning_properties, - top_k, -): - # optimal_kernels_list is a list of dictionaries - # - keys: (m, n, k), - # - values: Kernel object describing best parameters - # - number of elements in each dictionary = top_k - # each element of the list corresponds to the search of optimal kernels for a given mnk and a given algorithm - - print("Getting optimal kernels") - - # =============================================================================== - # Load predictive trees and feature list - tree = dict() - kernel_to_investigate = dict() - for algo in kernel_algorithm.keys(): - path_to_model = paths_to_models[algo] - if path_to_model is not None: - print( - "Algorithm: {:<8}, loading model from: {}".format(algo, path_to_model) - ) - tree[algo] = dict() - tree[algo]["file"] = path_to_model - features, tree[algo]["tree"] = safe_pickle_load(tree[algo]["file"]) - tree[algo]["features"] = features.tolist() - kernel_to_investigate[algo] = kernel_algorithm[algo] - else: - print("Algorithm: {:<8}, no model found.".format(algo)) - - if len(kernel_to_investigate) == 0: - print("No model found. Specify path to predictive models using ") - sys.exit(1) - - # =============================================================================== - # Get mnks_by_algo to compute: - mnks_by_algo = list(product(mnks_to_predict, kernel_to_investigate.keys())) - num_mnks_by_algo = len(mnks_by_algo) - optimal_kernels_list = list() - ckpt_folder_name = "predict_genpars_ckpt" - - if not os.path.exists(ckpt_folder_name): - os.mkdir(ckpt_folder_name) - print("Caching intermediate results to:", ckpt_folder_name) - - for i in range(0, num_mnks_by_algo, chunk_size): - # Chunk up tasks - start_chunk = i - end_chunk = int(min(start_chunk + chunk_size, num_mnks_by_algo)) - print(f"Completed {i} tasks out of {num_mnks_by_algo}") - - # Create checkpoint file or load checkpointed data from it - checkpoint_file_name = os.path.join( - ckpt_folder_name, f"chunk_{start_chunk}-{end_chunk - 1}.json" - ) - - if os.path.exists(checkpoint_file_name): - with open(checkpoint_file_name, "r") as f: - optimal_kernels_list__ = json.load(f) - optimal_kernels_list_ = list() - for i, optker in enumerate(optimal_kernels_list__): - optimal_kernels_list_.append({}) - for k, v in optker.items(): - algo = v.pop("algorithm") - optimal_kernels_list_[i][to_tuple(k)] = kernel_algorithm[algo]( - **v - ) - print(f"Read chunk {start_chunk}-{end_chunk - 1}\n") - - else: - if njobs == 1: - j = i - optimal_kernels_list_ = list() - # Ignore joblib and run serially: - for mnk, algo in mnks_by_algo[start_chunk:end_chunk]: - j += 1 - gc.collect() - print( - f"{j:6d} of {num_mnks_by_algo}: Find optimal kernels for mnk = {mnk} algo = {algo}" - ) - optker = find_optimal_kernel( - mnk, - algo, - tree[algo]["tree"], - tree[algo]["features"], - gpu_properties, - autotuning_properties, - ) - if optker: - optimal_kernels_list_.append(optker) - - else: - # Run prediction tasks in parallel with joblib - optimal_kernels_list_ = Parallel(n_jobs=njobs, verbose=2)( - delayed(find_optimal_kernel, check_pickle=True)( - mnk, - algo, - tree[algo]["tree"], - tree[algo]["features"], - gpu_properties, - autotuning_properties, - ) - for mnk, algo in mnks_by_algo[start_chunk:end_chunk] - ) - optimal_kernels_list_ = remove_empty_entries(optimal_kernels_list_) - - with open(checkpoint_file_name, "w") as f: - optimal_kernels_list__ = list() - for i, optker in enumerate(optimal_kernels_list_): - optimal_kernels_list__.append({}) - for k, v in optker.items(): - optimal_kernels_list__[i][to_string(k)] = v.as_dict - json.dump(optimal_kernels_list__, f) - print(f"Checkpoint file {checkpoint_file_name} written") - - optimal_kernels_list += optimal_kernels_list_ - - print("Finished gathering candidates for optimal parameter space") - - # Group optimal kernel candidates by (m,n,k) in a dictionary - optimal_kernels_mnk_algo = dict() - for optimal_kernel_mnk in optimal_kernels_list: - for mnk, kernels_mnk in optimal_kernel_mnk.items(): - m, n, k = mnk - if (m, n, k) in optimal_kernels_mnk_algo.keys(): - optimal_kernels_mnk_algo[(m, n, k)].append(kernels_mnk) - else: - optimal_kernels_mnk_algo[(m, n, k)] = [kernels_mnk] - - # Find optimal kernel per mnk among the different algorithm possibilities - optimal_kernels = dict() - for mnk, candidate_kernels in optimal_kernels_mnk_algo.items(): - m, n, k = mnk - optimal_kernel_mnk = sorted( - candidate_kernels, key=lambda x: x.perf, reverse=True - )[:top_k] - optimal_kernels[(m, n, k)] = optimal_kernel_mnk[0] - - return optimal_kernels - - -def get_baseline_kernels(mnks_to_predict, gpu_propertes, autotuning_properties): - print("Getting baseline kernels") - baseline_algorithm = "medium" - baseline_kernels = list() - for m, n, k in mnks_to_predict: - baseline_kernels[(m, n, k)] = kernel_algorithm[baseline_algorithm].baseline( - m, n, k, gpu_propertes, autotuning_properties - ) - - return baseline_kernels - - -# =============================================================================== -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=""" - Update parameter file with new optimal parameter predictions given newly trained decision trees. - - This script is part of the workflow for predictive modelling of optimal libsmm_acc parameters. - For more details, see README.md. - """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - parser.add_argument( - "-p", - "--params", - metavar="parameters_GPU.json", - default="../parameters/parameters_A100.json", - help="Parameter file to read and update with predictions", - ) - parser.add_argument( - "-j", "--njobs", type=int, default=-1, help="Number of joblib jobs" - ) - parser.add_argument( - "--baseline", - default=False, - help="Generate a parameter file corresponding to the baseline of a predictive model", - ) - parser.add_argument( - "--tiny", - default=None, - help="Path to model trained for algorithm 'tiny'. If not given, ignore this algorithm.", - ) - parser.add_argument( - "--small", - default=None, - help="Path to model trained for algorithm 'small'. If not given, ignore this algorithm.", - ) - parser.add_argument( - "--medium", - default=None, - help="Path to model trained for algorithm 'medium'. If not given, ignore this algorithm.", - ) - parser.add_argument( - "--largeDB1", - default=None, - help="Path to model trained for algorithm 'largeDB1'. If not given, ignore this algorithm.", - ) - parser.add_argument( - "--largeDB2", - default=None, - help="Path to model trained for algorithm 'largeDB2'. If not given, ignore this algorithm.", - ) - parser.add_argument( - "-c", - "--chunk_size", - type=int, - default=5000, - help="Chunk size for dispatching joblib jobs. If memory errors are experienced, reduce this number", - ) - - args = parser.parse_args() - paths_to_models = dict() - for algo in kernel_algorithm.keys(): - paths_to_models[algo] = args.__dict__[algo] - main(args.params, args.njobs, args.baseline, paths_to_models, args.chunk_size) diff --git a/src/acc/libsmm_acc/predict/predict_helpers.py b/src/acc/libsmm_acc/predict/predict_helpers.py deleted file mode 100644 index 890d793f003..00000000000 --- a/src/acc/libsmm_acc/predict/predict_helpers.py +++ /dev/null @@ -1,301 +0,0 @@ -# -*- coding: utf-8 -*- -#################################################################################################### -# Copyright (C) by the DBCSR developers group - All rights reserved # -# This file is part of the DBCSR library. # -# # -# For information on the license, see the LICENSE file. # -# For further information please visit https://dbcsr.cp2k.org # -# SPDX-License-Identifier: GPL-2.0+ # -#################################################################################################### - -import sys -import os -import pickle -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt - -sys.path.append("../") -from kernels.smm_acc import to_string # noqa: E402 - - -# =============================================================================== -# I/O helpers -def safe_pickle(data, file): - """ - Pickle big files safely by processing them in chunks. - This wrapper is a workaround for a bug on OSX (https://bugs.python.org/issue24658) - - :param data: data to be pickled - :param file: file to pickle it into - """ - max_bytes = 2**31 - 1 # Maximum number of bytes to write in one chunk - pickle_out = pickle.dumps(data) - n_bytes = len(pickle_out) - with open(file, "wb") as f: - count = 0 - for i in range(0, n_bytes, max_bytes): - f.write(pickle_out[i : min(n_bytes, i + max_bytes)]) - count += 1 - - -def safe_pickle_load(file_path): - """ - Load big pickled files safely by processing them in chunks - This wrapper is a workaround a bug on OSX (https://bugs.python.org/issue24658) - - :param data: data to be loaded through pickle - :param file: file to read from - """ - max_bytes = 2**31 - 1 # Maximum number of bytes to read in one chunk - bytes_in = bytearray(0) - input_size = os.path.getsize(file_path) - with open(file_path, "rb") as f: - for _ in range(0, input_size, max_bytes): - bytes_in += f.read(max_bytes) - return pickle.loads(bytes_in) - - -# =============================================================================== -# Model evaluation helpers -def performance_gain(baseline, current): - """ - Compute the absolute perfomance gain, in Gflop/s between a baseline and a 'current' - :param baseline, current: dictionary, keys: (m, n, k), values: performance in Gflop/s - :return: dictionary, keys: (m, n, k), values: performance difference in Gflop/s - """ - return dict( - zip( - sorted(current.keys()), - [ - current[(m, n, k)] - baseline[(m, n, k)] - for m, n, k in sorted(current.keys()) - ], - ) - ) - - -def relative_performance_gain(baseline, current): - """ - Compute the relative perfomance gain (no units), between a baseline and a 'current' - :param baseline, current: dictionary, keys: (m, n, k), values: performance in Gflop/s - :return: dictionary, keys: (m, n, k), values: relative performance difference (no units) - """ - return dict( - zip( - sorted(current.keys()), - [ - (current[(m, n, k)] - baseline[(m, n, k)]) / baseline[(m, n, k)] - for m, n, k in sorted(current.keys()) - ], - ) - ) - - -def plot_absolute_performance_gain( - perf_gain, mnk_names, baseline_name, current_name, pp=None -): - mnk_products = [ - m * n * k - for m, n, k in sorted(perf_gain.keys(), key=lambda x: x[0] * x[1] * x[2]) - ] - - plt.figure() - plt.plot(mnk_products, list(perf_gain.values()), ".", markersize=3) - plt.plot([mnk_products[0], mnk_products[-1]], [0, 0], "-r") - plt.xlabel(mnk_names + " (m, n, k) triplets (in order of increasing m*n*k)") - plt.ylabel("Performance Gain [Gflops]") - plt.title( - "Performance gain of " - + current_name - + " VS " - + baseline_name - + " parameter set" - ) - if pp is not None: - pp.savefig() - else: - plt.show() - plt.close() - - -def plot_relative_performance_gain( - rel_perf_gain, mnk_names, baseline_name, current_name, pp=None -): - mnk_products = [ - m * n * k - for m, n, k in sorted(rel_perf_gain.keys(), key=lambda x: x[0] * x[1] * x[2]) - ] - - plt.figure() - plt.plot( - mnk_products, 100 * np.array(list(rel_perf_gain.values())), ".", markersize=3 - ) - plt.plot([mnk_products[0], mnk_products[-1]], [0, 0], "-r") - plt.xlabel(mnk_names + " (m, n, k) triplets (in order of increasing m*n*k)") - plt.ylabel("Performance Gain [%]") - plt.title( - "Relative performance gain of " - + current_name - + " VS " - + baseline_name - + " parameter set" - ) - if pp is not None: - pp.savefig() - else: - plt.show() - plt.close() - - -def plot_performance_gains( - perf_gain1, perf_gain2, mnk_names, perf_gain1_name, perf_gain2_name, pp=None -): - mnks = [ - (m, n, k) - for m, n, k in sorted(perf_gain2.keys(), key=lambda x: x[0] * x[1] * x[2]) - ] - mnk_products = [ - m * n * k - for m, n, k in sorted(perf_gain2.keys(), key=lambda x: x[0] * x[1] * x[2]) - ] - res1 = [perf_gain1[mnk] for mnk in mnks] - res2 = [perf_gain2[mnk] for mnk in mnks] - - marker_size = 3 - plt.figure() - plt.plot(mnk_products, res1, ".", markersize=marker_size) - plt.plot(mnk_products, res2, ".", color="#d62728", markersize=marker_size) - plt.xlabel(mnk_names + " (m, n, k) triplets (in order of increasing m*n*k)") - plt.ylabel("Performance [Gflops]") - plt.xscale("log") - plt.legend([perf_gain1_name, perf_gain2_name]) - plt.title( - "Performance of " - + perf_gain1_name - + " and " - + perf_gain2_name - + " parameter set" - ) - if pp is not None: - pp.savefig() - else: - plt.show() - plt.close() - - -def plot_scaled_performance_gains( - perf_gain1, perf_gain2, mnk_names, perf_gain1_name, perf_gain2_name, pp=None -): - mnks = [ - (m, n, k) - for m, n, k in sorted(perf_gain2.keys(), key=lambda x: x[0] * x[1] * x[2]) - ] - mnk_products = [ - m * n * k - for m, n, k in sorted(perf_gain2.keys(), key=lambda x: x[0] * x[1] * x[2]) - ] - res1 = np.array([perf_gain1[mnk] for mnk in mnks]) - res2 = np.array([perf_gain2[mnk] for mnk in mnks]) - - marker_size = 3 - plt.figure() - plt.plot(mnk_products, 100 * res1, ".", markersize=marker_size) - plt.plot(mnk_products, 100 * res2, ".", color="#d62728", markersize=marker_size) - plt.xlabel(mnk_names + " (m, n, k) triplets (in order of increasing m*n*k)") - plt.ylabel("Scaled performance [%]") - plt.xscale("log") - plt.legend([perf_gain1_name, perf_gain2_name]) - plt.title( - "Performance of " - + perf_gain1_name - + " and " - + perf_gain2_name - + " parameter set" - ) - if pp is not None: - pp.savefig() - else: - plt.show() - plt.close() - - -def plot_choice_goodness( - m, - n, - k, - baseline_performances, - max_performances, - y_true, - y_pred, - train, - pp, - scaled=True, -): - # Sort in ascending performances - data_mnk = pd.DataFrame() - if scaled: - data_mnk["perf_true"] = (100 * y_true).tolist() - data_mnk["perf_pred"] = (100 * y_pred).tolist() - else: - data_mnk["perf_true"] = y_true.flatten().tolist() - data_mnk["perf_pred"] = y_pred.tolist() - data_mnk.sort_values(by="perf_true", inplace=True) - - # Plot - plt.figure() - marker_size = 1 - par_set_ids = range(len(data_mnk.index.values)) - plt.plot( - par_set_ids, - data_mnk["perf_true"], - "b.", - markersize=marker_size, - label="measured performances", - ) - plt.xlabel("Parameter set id") - plt.ylabel("Percentage of autotuned performance achieved [%]") - type = "train" if train else "test" - plt.title( - "Performance profile of parameter sets for " - + str((m, n, k)) - + "-triplet (" - + type - + ")" - ) - - # Annotate - x = [0, len(y_true)] - y = np.array([1, 1]) - perf_num = "{:2.2f}" - - # chosen - idx_perf_chosen = data_mnk["perf_pred"].idxmax() - perf_chosen = data_mnk["perf_true"][idx_perf_chosen] - plt.plot( - x, - perf_chosen * y, - "r-", - label="perf of chosen param set: " + perf_num.format(perf_chosen) + "%", - ) - - # baseline - if scaled: - # baseline = per algo, scale it to 0-1 - perf_baseline = ( - 100 - * baseline_performances[to_string(m, n, k)] - / max_performances["{}x{}x{}".format(m, n, k)] - ) - else: - perf_baseline = baseline_performances[to_string(m, n, k)] - plt.plot( - x, - perf_baseline * y, - "g-", - label="perf of baseline param set: " + perf_num.format(perf_baseline) + "%", - ) - - plt.legend(loc="lower right") - pp.savefig() - plt.close() diff --git a/src/acc/libsmm_acc/predict/predict_train.py b/src/acc/libsmm_acc/predict/predict_train.py deleted file mode 100755 index cf2b3845202..00000000000 --- a/src/acc/libsmm_acc/predict/predict_train.py +++ /dev/null @@ -1,1685 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -#################################################################################################### -# Copyright (C) by the DBCSR developers group - All rights reserved # -# This file is part of the DBCSR library. # -# # -# For information on the license, see the LICENSE file. # -# For further information please visit https://dbcsr.cp2k.org # -# SPDX-License-Identifier: GPL-2.0+ # -#################################################################################################### - -import os -import sys -import datetime -import json -import random -import numpy as np -import pandas as pd -import xgboost as xgb -import dask.dataframe as dd -import matplotlib.pyplot as plt -import argparse -from predict_helpers import ( - safe_pickle, - safe_pickle_load, - plot_choice_goodness, - plot_performance_gains, - plot_scaled_performance_gains, - plot_absolute_performance_gain, - plot_relative_performance_gain, - performance_gain, -) - -sys.path.append("../") -from kernels.smm_predict import to_tuple, to_string # noqa: E402 - -visual_separator = ( - "\n----------------------------------------------------------------------------" -) - - -# =============================================================================== -def main( - datadir, - destdir, - algo, - model_args, - nrows, - prefitted_model_folder, - run_intermediate_evaluation, -): - """ - Train a Machine Learning model on autotuning data to predict a kernel's performance given - its template parameters - """ - # =============================================================================== - # Create folder to store results of this training and start a log - folder, log_file, log = get_log_folder(prefitted_model_folder, destdir, algo) - - # =============================================================================== - # Override algorithm option if working on a pre-fitted model, and log program options - log += print_and_log(visual_separator) - algo, model_args, nrows, log = dump_or_load_options( - algo, model_args, prefitted_model_folder, nrows, folder, log - ) - - # =============================================================================== - # Get maximum and baseline performances - ( - max_performances, - max_performances_algo, - max_performances_ref, - baseline_performances_algo, - ) = get_reference_performances(datadir, algo) - - # =============================================================================== - # Read data - log += print_and_log(visual_separator) - X, X_mnk, Y, log, data_nrows = read_data(algo, datadir, nrows, folder, log) - - # =============================================================================== - # AT THIS POINT, WE MOVE FROM DASK (out-of-memory dataframes) TO PANDAS - # =============================================================================== - log += print_and_log("[moving to pandas] Compute X ...") - X = X.compute() - log += print_and_log("[moving to pandas] Compute Y ...") - Y = Y.compute() - log += print_and_log("[moving to pandas] Compute X_mnk ...") - X_mnk = X_mnk.compute() - log += print_and_log("[moving to pandas] Done") - - # =============================================================================== - # Get or train partial model (i.e. trained on the "training" part of the data, not the entire dataset) - log += print_and_log(visual_separator) - if len(prefitted_model_folder) == 0: # train a model - log += print_and_log("\nPreparing to fit model...") - ( - X_train, - Y_train, - X_mnk_train, - X_test, - Y_test, - X_mnk_test, - model_partial, - log, - ) = train_model(X, X_mnk, Y, algo, model_args, folder, log) - - else: # load pre-trained model - log += print_and_log( - "\nReading partial pre-fitted partial model from " + prefitted_model_folder - ) - ( - X_train, - Y_train, - X_mnk_train, - X_test, - Y_test, - X_mnk_test, - model_partial, - log, - ) = fetch_pre_trained_model_partial( - X, X_mnk, Y, model_args, prefitted_model_folder, log - ) - - # =============================================================================== - # Evaluate partial model - if model_partial is not None: - log = evaluate_model( - model_partial, - X_train, - X_mnk_train, - Y_train, - X_test, - X_mnk_test, - Y_test, - max_performances_ref, - max_performances_algo, - baseline_performances_algo, - data_nrows, - log, - folder, - ) - - # =============================================================================== - # Refit to the entire dataset - # Get or train model fit on the entire dataset (i.e. not just on the "training" part of the data) - model_file = os.path.join(prefitted_model_folder, "feature_tree_refit.p") - if ( - run_intermediate_evaluation - or len(prefitted_model_folder) == 0 - or not os.path.exists(model_file) - ): - log += print_and_log(visual_separator) - log += print_and_log("\nRefit to the entire dataset:") - X = X_train.append(X_test, ignore_index=True) - X_mnk = X_mnk_train.append(X_mnk_test, ignore_index=True) - Y = Y_train.append(Y_test, ignore_index=True) - model_partial.fit(X, Y) - model = ( - model_partial # This model is fit on the entire dataset, it is not partial - ) - results_file = os.path.join(folder, "feature_tree_refit.p") - safe_pickle([X.columns.values, model], results_file) - else: - log += print_and_log( - "\nReading pre-fitted model from " + prefitted_model_folder - ) - X, model, log = fetch_pre_trained_model(prefitted_model_folder, X, log) - - # =============================================================================== - # Evaluate refit-model - log = evaluate_model( - model, - X, - X_mnk, - Y, - None, - None, - None, - max_performances_ref, - max_performances_algo, - baseline_performances_algo, - data_nrows, - log, - folder, - ) - - # =============================================================================== - # Print log - log += print_and_log(visual_separator) - with open(log_file, "w") as f: - f.write(log) - - -# =============================================================================== -# Model hyperparameters -optimized_hyperparameters = { - # chosen by hyperparameter optimization. The optimal parameter depends on the GPU, the data ... - # the values below are the average of the optimal value for the P100 and the V100 - "tiny": { - "scikit_max_depth": 16, - "scikit_min_samples_leaf": 2, - "scikit_min_samples_split": 15, - "xgboost_max_depth": 12, - "xgboost_learning_rate": 0.1, - "xgboost_n_estimators": 100, - }, - "small": { - "scikit_max_depth": 16, - "scikit_min_samples_leaf": 2, - "scikit_min_samples_split": 15, - "xgboost_max_depth": 14, - "xgboost_learning_rate": 0.1, - "xgboost_n_estimators": 170, - }, - "medium": { - "scikit_max_depth": 18, - "scikit_min_samples_leaf": 2, - "scikit_min_samples_split": 13, - "xgboost_max_depth": 14, - "xgboost_learning_rate": 0.1, - "xgboost_n_estimators": 140, - }, - "largeDB1": { - "scikit_max_depth": 18, - "scikit_min_samples_leaf": 2, - "scikit_min_samples_split": 15, - "xgboost_max_depth": 14, - "xgboost_learning_rate": 0.1, - "xgboost_n_estimators": 170, - }, - "largeDB2": { - "scikit_max_depth": 18, - "scikit_min_samples_leaf": 2, - "scikit_min_samples_split": 15, - "xgboost_max_depth": 14, - "xgboost_learning_rate": 0.1, - "xgboost_n_estimators": 170, - }, -} - - -# =============================================================================== -# Printing and dumping helpers -def get_log_folder(prefitted_model_folder, destination_folder, algo): - """Create a unique log folder for this run in which logs, plots etc. will be stored""" - if len(prefitted_model_folder) == 0: - # Create a new folder for this model - file_signature = datetime.datetime.now().strftime("%Y-%m-%d--%H-%M") - folder_name = os.path.join( - "model_selection", os.path.join(algo, file_signature) - ) - if destination_folder != ".": - folder = os.path.join(destination_folder, folder_name) - else: - folder = folder_name - log_file = os.path.join(folder, "log.txt") - if not os.path.exists(folder): - while True: # loop until we've created a folder - try: - os.makedirs(folder) - break - except FileExistsError: - time_stamp_seconds = datetime.datetime.now().strftime("-%S") - new_folder = folder + time_stamp_seconds - print( - "Folder {} exists already. Trying to create folder {}.".format( - folder, new_folder - ) - ) - folder = new_folder - - else: - # If loading a pre-fitted model, use this pre-fitted model's folder as a log folder, but create a new log file - folder = prefitted_model_folder - log_file_signature = datetime.datetime.now().strftime("%Y-%m-%d--%H-%M") - log_file = os.path.join(folder, "log_" + log_file_signature + ".txt") - - # Log folder and file - log = "" - log += print_and_log("\nLogging to:") - log += print_and_log("\t" + folder) - log += print_and_log("\t" + log_file) - - return folder, log_file, log - - -def dump_or_load_options(algo, model_args, prefitted_model, nrows, folder, log): - options_file_name = os.path.join(folder, "options.json") - pgm_options = {"folder": folder, "algo": algo, "nrows": nrows} - pgm_options.update(model_args) - - if len(prefitted_model) == 0: - # if we're training a model, dump options to folder so they can be reloaded in another run - print("Dump options to", options_file_name) - with open(options_file_name, "w") as f: - json.dump(pgm_options, f) - - else: - # if we're using a pre-fitted model, load options from that model - print("Read options from", options_file_name) - with open(options_file_name, "r") as f: - pgm_options = json.load(f) - - algo = pgm_options["algo"] - model_args_list = ["model", "splits", "ntrees", "njobs"] - model_args = dict() - for m in model_args_list: - model_args[m] = pgm_options[m] - nrows = pgm_options["nrows"] - - # Log options - log += print_and_log("Predict-train running with options:") - for opt, opt_val in pgm_options.items(): - log += print_and_log("{:<15}: {}".format(opt, opt_val)) - - return algo, model_args, nrows, log - - -def print_and_log(msg): - if not isinstance(msg, str): - msg = str(msg) - log = "\n" + msg - print(msg) - return log - - -def dask_to_pandas(*dfs): - """Convert training data dask -> pandas""" - pd_dfs = [df.compute() for df in dfs] - return pd_dfs[0] if len(pd_dfs) == 1 else pd_dfs - - -def pandas_to_dask(*dfs): - """Convert training data pandas -> dask""" - dd_dfs = [dd.from_pandas(df, npartitions=3) for df in dfs] - return dd_dfs[0] if len(dd_dfs) == 1 else dd_dfs - - -# =============================================================================== -# Custom loss functions and scorers -def perf_loss(y_true, y_pred, top_k, X_mnk, scaled=True): - """ - Compute the relative performance losses per mnk if one were to measure the top-k best predicted sets of parameters - and pick the best out of this top-k - - :param y_true: ground truth performances (performance scaled between 0 and 1) - :param y_pred: estimated performances (performance scaled between 0 and 1) - :param top_k: number of top performances to measure - :param X_mnk: corresponding mnks - :return: perf_losses: array of relative performance losses (in %), one array element per mnk - """ - assert len(y_true.index) == y_pred.flatten().size - assert len(y_true.index) == len(X_mnk.index) - - perf_losses = list() - mnks = np.unique(X_mnk["mnk"].values) - for mnk in mnks: - # Get performances per mnk - idx_mnk = np.where(X_mnk == mnk)[0].tolist() - assert len(idx_mnk) > 0, "idx_mnk is empty" - y_true_mnk = y_true.iloc[idx_mnk] - y_pred_mnk = y_pred[idx_mnk] - - # Get top-k best predicted performances - if top_k != 1: - top_k_idx = np.argpartition(-y_pred_mnk, top_k)[:top_k] - else: - top_k_idx = np.argmax(y_pred_mnk) - y_correspmax = y_true_mnk.iloc[top_k_idx] - - # Chosen max perf. among predicted max performances - maxperf_chosen = np.amax(y_correspmax) - - # True Max. performances - if not scaled: - maxperf = float(y_true_mnk.max(axis=0)) - assert maxperf >= 0, "Found non-positive value for maxperf: " + str(maxperf) - perf_loss = (maxperf - maxperf_chosen) / maxperf - else: - perf_loss = 1.0 - maxperf_chosen - - # Relative performance loss incurred by using model-predicted parameters instead of autotuned ones [%] - perf_losses.append(100 * perf_loss) - - return perf_losses - - -def worse_rel_perf_loss_of_k(y_true, y_pred, top_k, X_mnk, scaled=True): - y = np.array(perf_loss(y_true, y_pred, top_k, X_mnk, scaled)) - return float(y.max(axis=0)) - - -def mean_rel_perf_loss_of_k(y_true, y_pred, top_k, X_mnk, scaled=True): - y = np.array(perf_loss(y_true, y_pred, top_k, X_mnk, scaled)) - return float(y.mean(axis=0)) - - -def worse_case_scorer(estimator, X, y, top_k): - """ - :param estimator: the model that should be evaluated - :param X: validation data - :param y: ground truth target for X - :return: score: a floating point number that quantifies the estimator prediction quality on X, with reference to y - """ - mnk = dd.DataFrame() - mnk["mnk"] = X["mnk"].copy() - y_pred = estimator.predict(X.drop(["mnk"].values, axis=1)) - score = worse_rel_perf_loss_of_k(y, y_pred, top_k, mnk) - return ( - -score - ) # by scikit-learn convention, higher numbers are better, so the value should be negated - - -def worse_case_scorer_top1(estimator, X, y): - return worse_case_scorer(estimator, X, y, 1) - - -def mean_scorer(estimator, X, y, top_k): - """ - :param estimator: the model that should be evaluated - :param X: validation data - :param y: ground truth target for X - :return: score: a floating point number that quantifies the estimator prediction quality on X, with reference to y - """ - mnk = dd.DataFrame() - mnk["mnk"] = X["mnk"].copy() - y_pred = estimator.predict(X.drop(["mnk"].values, axis=1)) - score = mean_rel_perf_loss_of_k(y, y_pred, top_k, mnk) - return ( - -score - ) # by scikit-learn convention, higher numbers are better, so the value should be negated - - -def mean_scorer_top1(estimator, X, y): - return mean_scorer(estimator, X, y, 1) - - -# =============================================================================== -# Read and prepare data -def get_reference_performances(folder, algo): - import json - - maxperf_file = os.path.join(folder, "max_performances.json") - with open(maxperf_file) as f: - max_performances = json.load(f) - - maxperf_file = os.path.join(folder, "max_performances_by_algo.json") - with open(maxperf_file) as f: - max_performances_algo = json.load(f)[algo] - - max_performances_ref = max_performances - - baseline_file = os.path.join(folder, "baseline_performances_by_algo.json") - with open(baseline_file) as f: - baseline_performances_algo = json.load(f)[algo] - - return ( - max_performances, - max_performances_algo, - max_performances_ref, - baseline_performances_algo, - ) - - -def read_data(algo, read_from, nrows, folder, log): - parquet_data_file = os.path.join(read_from, "training_data_" + algo + ".parquet") - log += print_and_log("\nRead data from " + parquet_data_file) - - # =============================================================================== - # Get 'X' - cols_to_ignore = [ - "perf_scaled", - "mnk", - "perf (Gflop/s)", - "perf_scaled_by_algo", - "perf_squared", - ] - X = dd.read_parquet(parquet_data_file) - cols_to_drop = set(cols_to_ignore).intersection(set(X.columns.values)) - log += print_and_log("\nDropping following columns from X:\n" + str(cols_to_drop)) - X = X.drop(cols_to_drop, axis=1) - log += print_and_log( - "X : {:>8,} x {:>8,} ({:>2.2} MB)".format( - len(X), len(X.columns), sys.getsizeof(X) / 10**6 - ) - ) - log += print_and_log("Head:") - log += print_and_log(X.head()) - n_features = len(list(X.columns)) - predictor_names = X.columns.values - log += print_and_log("\nPredictor variables: (" + str(n_features) + ")") - for i, p in enumerate(predictor_names): - log += print_and_log("\t{:2}) {}".format(i + 1, p)) - - # =============================================================================== - # Get 'Y' - log += print_and_log("\nRead Y") - Y = dd.read_parquet(parquet_data_file, columns=["perf_scaled"]) - log += print_and_log( - "Y : {:>8,} ({:>2.2} MB)".format(len(Y), sys.getsizeof(Y) / 10**6) - ) - log += print_and_log("Head:") - log += print_and_log(Y.head()) - - # =============================================================================== - # Get 'X_mnk' - log += print_and_log("\nRead X_mnk") - X_mnk = dd.read_parquet(parquet_data_file, columns=["mnk"]) - nrows_data = len(X_mnk.index) - log += print_and_log( - "X_mnk : {:>8,} ({:>2.2} MB)".format(nrows_data, sys.getsizeof(X_mnk) / 10**6) - ) - log += print_and_log("Head:") - log += print_and_log(X_mnk.head()) - log += print_and_log("# unique mnks:") - log += print_and_log(str(X_mnk["mnk"].nunique().compute()) + "\n") - - return X, X_mnk, Y, log, nrows_data - - -# =============================================================================== -# Predictive modelling -def get_hyperparameter_grid(algo, model_name, n_features): - # Hyper-parameters to optimize - param_grid = dict() - if "scikit" in model_name: # it is a scikit-learn model - if algo == "medium": - max_depth = [10, 13, 16, 18, 21, 24] - min_samples_split = [2, 8, 12, 18] - min_samples_leaf = [2, 8, 12, 18] - elif algo == "tiny": - step = 1 - max_depth = range(4, int(2 * n_features) + 1, step) - min_samples_split = range(1, 26, step) - min_samples_leaf = range(1, 26, step) - elif algo == "small": - step = 3 - max_depth = range(4, int(2 * n_features) + 1, step) - min_samples_split = [2, 5, 8, 13, 18] - min_samples_leaf = [2, 5, 8, 13, 18] - else: # largeDB1,2 - step = 3 - max_depth = range(4, int(2 * n_features) + 1, step) - min_samples_split = range(2, 21, step) - min_samples_leaf = range(2, 21, step) - param_grid = { - model_name + "__estimator__" + "max_depth": list(max_depth), - model_name + "__estimator__" + "min_samples_split": list(min_samples_split), - model_name + "__estimator__" + "min_samples_leaf": list(min_samples_leaf), - } - elif "xgb" in model_name: # it is an XGBOOST model - if algo == "medium": - max_depth = [16, 13] - n_estimators = [100, 140] - learning_rate = [0.1] - elif algo == "tiny": - max_depth = range(10, n_features + 2, 1) - n_estimators = range(30, 160, 20) - learning_rate = range(1, 5) - learning_rate = [i / 10 for i in learning_rate] - elif algo == "small": - max_max_depth = 20 - max_depth = range(10, min(max_max_depth, n_features + 2), 4) - n_estimators = range(50, 200, 30) - learning_rate = [0.1, 0.3] - else: # largeDB1,2 - max_max_depth = 20 - max_depth = range(10, min(max_max_depth, n_features + 2), 4) - n_estimators = range(50, 200, 30) - learning_rate = [0.1, 0.3] - param_grid = { - "max_depth": list(max_depth), - "learning_rate": list(learning_rate), - "n_estimators": list(n_estimators), - } - else: - raise AssertionError("Cannot recognize model: " + model_name) - - return param_grid - - -def get_scikit_DecisionTree_model(algo): - from sklearn.tree import DecisionTreeRegressor - - model = DecisionTreeRegressor( - criterion="mse", - splitter="best", - min_samples_split=optimized_hyperparameters[algo]["scikit_min_samples_split"], - min_samples_leaf=optimized_hyperparameters[algo]["scikit_min_samples_leaf"], - max_depth=optimized_hyperparameters[algo]["scikit_max_depth"], - max_features=None, - max_leaf_nodes=None, - ) - # Feature selection through permutation importance - from eli5.sklearn import PermutationImportance - - model_perm = PermutationImportance(model, cv=None) - return model_perm, "scikit-Decision_Tree" - - -def get_scikit_RandomForest_model(algo, njobs, ntrees): - from sklearn.ensemble import RandomForestRegressor - - model = RandomForestRegressor( - criterion="mse", - n_estimators=ntrees, - min_samples_split=optimized_hyperparameters[algo]["scikit_min_samples_split"], - min_samples_leaf=optimized_hyperparameters[algo]["scikit_min_samples_leaf"], - max_depth=optimized_hyperparameters[algo]["scikit_max_depth"], - bootstrap=True, - max_features="sqrt", - n_jobs=njobs, - ) - return model, "scikit-Random_Forest" - - -def get_xgb_DecisionTree_model(algo, njobs, ntrees): - params = { - "max_depth": optimized_hyperparameters[algo]["xgboost_max_depth"], - "learning_rate": optimized_hyperparameters[algo]["xgboost_learning_rate"], - "n_estimators": optimized_hyperparameters[algo]["xgboost_n_estimators"], - "tree_method": "exact", - "verbosity": 2, - "objective": "reg:squarederror", - "booster": "gbtree", - "n_jobs": njobs, - } - model = xgb.XGBRegressor(**params) - return model, "xgb-Decision_Tree" - - -def get_xgb_DecisionTree_dask_model(algo, njobs, ntrees): - params = { - "max_depth": optimized_hyperparameters[algo]["xgboost_max_depth"], - "learning_rate": optimized_hyperparameters[algo]["xgboost_learning_rate"], - "n_estimators": optimized_hyperparameters[algo]["xgboost_n_estimators"], - "tree_method": "exact", - "verbosity": 2, - "objective": "reg:squarederror", - "booster": "gbtree", - "n_jobs": njobs, - } - from dask_ml.xgboost import XGBRegressor_dask - - model = XGBRegressor_dask(**params) - return model, "xgb-Decision_Tree_dask" - - -def get_xgb_DecisionTree_GPU_model(algo, njobs, ntrees): - params = { - "max_depth": optimized_hyperparameters[algo]["xgboost_max_depth"], - "learning_rate": optimized_hyperparameters[algo]["xgboost_learning_rate"], - "n_estimators": optimized_hyperparameters[algo]["xgboost_n_estimators"], - "tree_method": "gpu_hist", - "verbosity": 2, - "objective": "reg:squarederror", - "booster": "gbtree", - "n_jobs": njobs, - } - model = xgb.XGBRegressor(**params) - return model, "xgb-Decision_Tree_GPU" - - -def get_xgb_RandomForest_model(algo, njobs, ntrees): - params = { - "max_depth": optimized_hyperparameters[algo]["xgboost_max_depth"], - "learning_rate": optimized_hyperparameters[algo]["xgboost_learning_rate"], - "n_estimators": optimized_hyperparameters[algo]["xgboost_n_estimators"], - "tree_method": "exact", - "nthread": njobs, - "subsample": 0.5, - "colsample_bynode": 0.8, - "num_parallel_tree": ntrees, - "verbosity": 2, - "objective": "reg:squarederror", - } - model = xgb.XGBRFRegressor(**params) - return model, "xgb-Random_Forest" - - -def get_model(model_to_train, algo, njobs, ntrees): - if model_to_train == "DT": - model, model_name = get_scikit_DecisionTree_model(algo) - elif model_to_train == "RF": - model, model_name = get_scikit_RandomForest_model(algo, njobs, ntrees) - elif model_to_train == "xgb-DT": - model, model_name = get_xgb_DecisionTree_model(algo, njobs, ntrees) - elif model_to_train == "xgb-DT-dask": - model, model_name = get_xgb_DecisionTree_dask_model(algo, njobs, ntrees) - elif model_to_train == "xgb-DT-GPU": - model, model_name = get_xgb_DecisionTree_GPU_model(algo, njobs, ntrees) - elif model_to_train == "xgb-RF": - model, model_name = get_xgb_RandomForest_model(algo, njobs, ntrees) - else: - raise AssertionError( - "Cannot recognize model: " + model_to_train + ". Options: DT, RF" - ) - return model, model_name - - -def get_train_test_partition(to_partition, test, train=None): - """ - Perform train/test partition - :param to_partition: sequence of objects to partition - :param test: ndarray, test-indices - :param train (optional): ndarray - :return: - """ - if train is None: # Retrieve training indices - all_indices = set(range(len(to_partition[0].index))) - train = list(all_indices - set(test)) - - print( - "About to partition into train (len: {:,}) / test (len: {:,})".format( - len(train), len(test) - ) - ) - partitioned = list() - for df in to_partition: - df_train = df.iloc[ - train, : - ] # train: use for hyper-parameter optimization (via CV) and training - partitioned.append(df_train) - df_test = df.iloc[ - test, : - ] # test : use for evaluation of 'selected/final' model - partitioned.append(df_test) - - print("Returning object of length: {}".format(len(partitioned))) - return partitioned - - -def train_model(X, X_mnk, Y, algo, model_options, folder, log): - # =============================================================================== - # Get options - results_file = os.path.join(folder, "feature_tree.p") - - # =============================================================================== - # Testing splitter (train/test-split) - from sklearn.model_selection import GroupShuffleSplit - - cv = GroupShuffleSplit(n_splits=2, test_size=0.2) - train_test_splits = cv.split(X, Y, groups=X_mnk["mnk"]) - train, test = next(train_test_splits) - ( - X_train, - X_test, - Y_train, - Y_test, - X_mnk_train, - X_mnk_test, - ) = get_train_test_partition([X, Y, X_mnk], test, train) - plot_train_test_partition(test, train, X_mnk, folder) - log += print_and_log( - "\nComplete train/test split, total size=" - + str(X.shape) - + ", test size=" - + str(X_test.shape) - + ", train_size=" - + str(X_train.shape) - ) - del X, X_mnk, Y # free memory - predictor_names = X_train.columns.values - - # =============================================================================== - # Predictive model - model_to_train = model_options["model"] - model, model_name = get_model( - model_to_train, algo, model_options["njobs"], model_options["ntrees"] - ) - log += print_and_log( - "\nStart tune/train for model " + model_name + " with parameters:" - ) - log += print_and_log(model) - - # =============================================================================== - # Cross-validation splitter (train/validation-split) - test_size = 0.3 - cv = GroupShuffleSplit(n_splits=model_options["splits"], test_size=test_size) - - # =============================================================================== - # Feature selection: SelectFromModel - from sklearn.feature_selection import SelectFromModel - - feature_importance_threshold = ( - 0.0005 # only remove the features with VERY little importance - ) - model.cv = cv.split(X_train.values, Y_train.values, groups=X_mnk_train.values) - model.fit(X_train.values, Y_train.values) - model_fs = SelectFromModel( - model, threshold=feature_importance_threshold, max_features=None, prefit=True - ) - print(model_fs) - model.cv = None - - # =============================================================================== - # Info on feature selection - all_feature_names = X_train.columns.values.tolist() - feature_support = model_fs.get_support() - features_importances = model.feature_importances_ - feature_name_importance = zip( - all_feature_names, features_importances, feature_support - ) - feature_name_importance = sorted( - feature_name_importance, key=lambda x: x[1], reverse=True - ) - - log += print_and_log(visual_separator) - n_selected_features = np.sum(feature_support) - log += print_and_log("Optimal number of features : {}".format(n_selected_features)) - - # Selected features - log += print_and_log("\nFeatures:") - selected_features = list() - selected_feature_importances = list() - for i, (feat_name, feat_imp, feat_in) in enumerate(feature_name_importance): - in_or_out = "accepted" if feat_in else " x rejected" - log += print_and_log( - "{:>2}) {:<40}, imp: {:>1.3f} {}".format( - i + 1, feat_name, feat_imp, in_or_out - ) - ) - if feat_in: - selected_features.append(feat_name) - selected_feature_importances.append(feat_imp) - plot_feature_importance(features_importances, all_feature_names, folder) - - # Drop non-selected features - features_to_drop = [f for f in predictor_names if f not in selected_features] - X_train = X_train.drop(features_to_drop, axis=1) - X_test = X_test.drop(features_to_drop, axis=1) - n_features = len(X_train.columns) - - # =============================================================================== - # Fit - out_of_memory_computation = "dask" in model_options["model"] - if out_of_memory_computation: - X_train, Y_train = pandas_to_dask(X_train, Y_train) - - if model_options["hyperparameter_optimization"]: - # Hyperparameter Optimization - param_grid = get_hyperparameter_grid(algo, model_name, n_features) - if param_grid is None: - raise AssertionError("param_grid object is None. Please implement!") - - # At this point, we "cheat"/"take a shortcut" in 2 ways: - # - we split into train/test partitions using the simple default splitter, not one that is aware of mnk-groups - # - we use an overall MSE scorer, not one that looks at the performance loss of predicted mnks wrt. autotuned - if out_of_memory_computation: - from dask_ml.model_selection import GridSearchCV - - gds_pars = { - "estimator": model, - "param_grid": param_grid, - "cv": model_options["splits"], - "refit": True, - "n_jobs": 1, - } - else: - from sklearn.model_selection import GridSearchCV - - gds_pars = { - "estimator": model, - "param_grid": param_grid, - "cv": model_options["splits"], - "refit": True, - "n_jobs": 1, - "verbose": 2, - } - gds = GridSearchCV(**gds_pars) - log += print_and_log(visual_separator) - log += print_and_log("\nStart hyperparameter optimization & training ... :\n") - log += print_and_log("Hyper-parameter grid:") - for par, values in param_grid.items(): - log += print_and_log("\t" + par + ": " + str(values)) - log += print_and_log("\n") - gds.fit(X_train.values, Y_train.values) - log += print_and_log("... done") - describe_hpo(gds, log, folder) - model = gds.best_estimator_ - - else: - # Fit - log += print_and_log(visual_separator) - log += print_and_log("\nStart fitting model with predictors:\n") - for i, p in enumerate(X_train.columns.values): - log += print_and_log("\t{:>2}) {}".format(i + 1, p)) - - model.fit(X_train, Y_train) - - safe_pickle([X_train.columns.values, model, test], results_file) - log += print_and_log("\nCompleted fit, wrote results to " + results_file) - log += print_and_log(visual_separator) - return_model = model - - # Return - if "mnk" in X_train.columns.values: - X_train.drop("mnk", axis=1, inplace=True) - if "mnk" in X_test.columns.values: - X_train.drop("mnk", axis=1, inplace=True) - - if out_of_memory_computation: - X_train, Y_train = dask_to_pandas(X_train, Y_train) - - return X_train, Y_train, X_mnk_train, X_test, Y_test, X_mnk_test, return_model, log - - -def fetch_pre_trained_model(model_path_folder, X, log): - model_path = os.path.join(model_path_folder, "feature_tree_refit.p") - print("fetched pre-trained model from: {}".format(model_path)) - features, model = safe_pickle_load(model_path) - print("Pickled variables:\nfeatures:{}\nmodel:{}".format(features, model)) - - log += print_and_log("\nDrop non-selected features") - predictor_names = X.columns.values.tolist() - features_to_drop = [f for f in predictor_names if f not in features] - X.drop(features_to_drop, axis=1, inplace=True) - return X, model, log - - -def fetch_pre_trained_model_partial(X, X_mnk, Y, model_options, model_path_folder, log): - # Load pre-trained model, selected features and indices of test-set - model_path = os.path.join(model_path_folder, "feature_tree.p") - print("fetched partial pre-trained model from: {}".format(model_path)) - features, model, test_indices = safe_pickle_load(model_path) - print( - "Pickled stuff:\nfeatures:{}\nmodel:{}\ntest_indices:{}".format( - features, model, test_indices - ) - ) - if "mnk" in features: - features.remove("mnk") - - log += print_and_log("\nPerform train/test split") - ( - X_train, - X_test, - Y_train, - Y_test, - X_mnk_train, - X_mnk_test, - ) = get_train_test_partition([X, Y, X_mnk], test_indices) - log += print_and_log( - "\nComplete train/test split, total size=" - + str(X.shape) - + ", test size=" - + str(X_test.shape) - + ", train_size=" - + str(X_train.shape) - ) - - log += print_and_log("\nDrop non-selected features") - predictor_names = X_train.columns.values.tolist() - features_to_drop = [f for f in predictor_names if f not in features] - X_train.drop(features_to_drop, axis=1, inplace=True) - X_test.drop(features_to_drop, axis=1, inplace=True) - - out_of_memory_computation = "dask" in model_options["model"] - if out_of_memory_computation: - X_train, Y_train = pandas_to_dask(X_train, Y_train) - - return X_train, Y_train, X_mnk_train, X_test, Y_test, X_mnk_test, model, log - - -# =============================================================================== -# Describe and evaluate model -def describe_hpo(gs, log, folder): - # Scores obtained during hyperparameter optimization - columns_to_print = list() - for par in gs.param_grid.keys(): - columns_to_print.append("param_" + par) - columns_to_print += [ - "mean_test_score", - "std_test_score", - "mean_train_score", - "std_train_score", - ] - log += print_and_log("\nHyperparameter search results (head):") - cv_results = pd.DataFrame(gs.cv_results_)[columns_to_print] - with pd.option_context("display.max_rows", None, "display.max_columns", None): - log += print_and_log(cv_results.head()) - cv_results_path = os.path.join(folder, "hyperparameter_optimization_results.csv") - with open(cv_results_path, "w") as f: - cv_results.to_csv(f, index=False) - log += print_and_log("Wrote hyperparameter results to " + cv_results_path) - - # Best parameter set - log += print_and_log("\nBest parameters set found on development set:") - for bestpar_name, bestpar_value in gs.best_params_.items(): - log += print_and_log("\t{}: {}".format(bestpar_name, bestpar_value)) - - # Best estimator - log += print_and_log("\nBest estimator:") - best_estimator = gs.best_estimator_ - log += print_and_log(best_estimator) - log += print_and_log(visual_separator) - - return log - - -def describe_model(model, X, Y, log): - predictor_names = X.columns.values.tolist() - log += print_and_log("Model:") - log += print_and_log(model) - - log += print_and_log("Predictor variables:") - for p in predictor_names: - log += print_and_log("\t{}".format(p)) - - return log - - -def print_custom_error(y_true, y_pred, X_mnk, log, scaled=True): - result_line = ( - "\tRelative performance loss compared to autotuned max:\n" - + "top-{}: worse: {:>6.3f} [%], mean: {:>6.3f} [%]" - ) - for top_k in [1]: - log += print_and_log( - result_line.format( - top_k, - worse_rel_perf_loss_of_k(y_true, y_pred, top_k, X_mnk, scaled), - mean_rel_perf_loss_of_k(y_true, y_pred, top_k, X_mnk, scaled), - ) - ) - return log - - -def print_error(y_true, y_pred, log): - from sklearn.metrics import mean_absolute_error, mean_squared_error - - result_line = "\tOverall error:\n" + "absolute: {:>6.3f}, mean squared {:>6.3f}" - log += print_and_log( - result_line.format( - mean_absolute_error(y_true, y_pred), mean_squared_error(y_true, y_pred) - ) - ) - return log - - -def scale_back(y_scaled, x_mnk, max_performances, mnk=None): - if mnk is None: - corresponding_maxperf = np.array( - [max_performances[mnk] for mnk in x_mnk["mnk"].values.tolist()] - ) - else: - corresponding_maxperf = max_performances[mnk] - return y_scaled * corresponding_maxperf - - -def plot_train_test_partition(test_idx, train_idx, X_mnk, folder): - import matplotlib.pyplot as plt - - mnks_string_train = X_mnk["mnk"].iloc[train_idx].unique() - mnks_train = to_tuple(*mnks_string_train) - mnks_string_test = X_mnk["mnk"].iloc[test_idx].unique() - mnks_test = to_tuple(*mnks_string_test) - - y_train_product = ( - dict() - ) # keys: m*n*k, values: how many times this mnk-product appears in training-mnks - for m, n, k in mnks_train: - mxnxk = m * n * k - if mxnxk in y_train_product.keys(): - y_train_product[mxnxk] += 1 - else: - y_train_product[mxnxk] = 1 - - train_mnks = list() - train_counts = list() - for mnk, count in y_train_product.items(): - for c in range(count): - train_mnks.append(mnk) - train_counts.append(c + 1) - - y_test_product = dict() - for m, n, k in mnks_test: - mxnxk = m * n * k - if mxnxk in y_test_product.keys(): - y_test_product[mxnxk] += 1 - else: - y_test_product[mxnxk] = 1 - - test_mnks = list() - test_counts = list() - for mnk, count in y_test_product.items(): - for c in range(count): - test_mnks.append(mnk) - if mnk in y_train_product.keys(): - test_counts.append(y_train_product[mnk] + c + 1) - else: - test_counts.append(c + 1) - - plt.figure(figsize=(30, 5)) - markersize = 12 - plt.plot( - train_mnks, - train_counts, - "o", - markersize=markersize, - color="blue", - label="training mnks (" + str(len(train_mnks)) + ")", - ) - plt.plot( - test_mnks, - test_counts, - "o", - markersize=markersize, - color="red", - label="testing mnks (" + str(len(test_mnks)) + ")", - ) - plot_file_path = os.path.join(folder, "train-test_split.svg") - plt.xlabel("m * n * k triplets") - plt.ylabel("number of occurences in data set") - plt.title("Train/test split") - maxcount = max(max(test_counts), max(train_counts)) + 1 - plt.ylim([0, maxcount]) - plt.legend() - plt.savefig(plot_file_path) - - -def plot_feature_importance(importances, names, folder): - plt.rcdefaults() - fig, ax = plt.subplots() - - ax.set_title("Feature importances") - ax.barh(range(len(names)), importances, color="g", align="center") - ax.set_yticks(np.arange(len(importances))) - ax.set_yticklabels(names) - ax.invert_yaxis() - plot_file_path = os.path.join(folder, "feature_importance.svg") - plt.savefig(plot_file_path) - print(plot_file_path) - - -def plot_loss_histogram(y_true, y_pred, X_mnk, folder): - import matplotlib.pyplot as plt - - # Get losses - top_k = 1 - y = np.array(perf_loss(y_true, y_pred, top_k, X_mnk, False)) - - # Losses-histogram - num_bins = 100 - plt.figure() - plt.hist(y, num_bins, facecolor="green", alpha=0.75) - plt.xlabel("relative performance loss [%]") - plt.ylabel("# occurrences") - plt.title( - "Performance losses for top-k=" - + str(top_k) - + " (" - + str(len(y)) - + " test mnks)" - ) - plot_file_path = os.path.join(folder, "result_losses.svg") - plt.savefig(plot_file_path) - print(plot_file_path) - - -def plot_prediction_accuracy(m, n, k, y_true, y_pred, train, pp): - plt.figure() - if train: - plt.plot(100 * y_true, 100 * y_pred, "b.", label="truth") - else: - plt.plot(100 * y_true, 100 * y_pred, "r.", label="truth") - plt.xlabel("true scaled performance [%]") - plt.ylabel("predicted scaled performance [%]") - type = "train" if train else "test" - plt.title("Prediction accuracy for kernel " + str((m, n, k)) + " (" + type + ")") - pp.savefig() - - -def get_predive_model_performances( - y_true, y_pred, x_mnk, max_performances_ref, max_performances_algo -): - predictive_model_perf_scaled = dict() - - for mnk_string in x_mnk["mnk"].unique(): - idx_mnk = np.where(x_mnk == mnk_string)[0].tolist() - assert len(idx_mnk) > 0, "idx_mnk is empty" - m, n, k = to_tuple(mnk_string) - - perf_chosen_idx = [np.argmax(y_pred[idx_mnk])] - perf_effective = y_true.iloc[idx_mnk].iloc[perf_chosen_idx].values.item() - predictive_model_perf_scaled[(m, n, k)] = ( - perf_effective # 'scaled' between 0 and 1 - ) - - predictive_model_perf = dict( - zip( - predictive_model_perf_scaled.keys(), - [ - perf_scaled * max_performances_ref[to_string(mnk)] - for mnk, perf_scaled in predictive_model_perf_scaled.items() - ], - ) - ) - - # Re-scale performances by algorithm for a fair comparison - predictive_model_perf_scaled = dict( - zip( - predictive_model_perf.keys(), - [ - perf / max_performances_algo[mnk] - for mnk, perf in predictive_model_perf.items() - ], - ) - ) - - return predictive_model_perf, predictive_model_perf_scaled - - -# =============================================================================== -def evaluate_model( - model, - X_train, - X_mnk_train, - Y_train, - X_test, - X_mnk_test, - Y_test, - max_performances_ref, - max_performances_algo, - baseline_performances_algo, - data_nrows, - log, - folder, -): - """Main evaluation function""" - if model is None: - return log - - # Start evaluation - log += print_and_log(visual_separator) - log += print_and_log("Start model evaluation") - if all([x is not None for x in [X_test, Y_test]]): - log = describe_model(model, X_test, Y_test, log) - - # Training error - if all([x is not None for x in [X_train, X_mnk_train, Y_train]]): - y_train_pred = model.predict(X_train.values) - log += print_and_log("\nTraining error: (train&val)") - log = print_custom_error(Y_train, y_train_pred, X_mnk_train, log, True) - log = print_error(Y_train, y_train_pred, log) - - # Test error - if all([x is not None for x in [X_test, X_mnk_test, Y_test]]): - y_test_pred = model.predict(X_test) - log += print_and_log("\nTesting error:") - log = print_custom_error(Y_test, y_test_pred, X_mnk_test, log, True) - log = print_error(Y_test, y_test_pred, log) - - # Training error (scaled-back) - if all([x is not None for x in [X_train, X_mnk_train, Y_train]]): - log += print_and_log("\nTraining error (scaled back): (train&val)") - y_train_pred_scaled_back = scale_back( - y_train_pred, X_mnk_train, max_performances_ref - ) - y_train_scaled_back = pd.DataFrame( - scale_back(Y_train.values.flatten(), X_mnk_train, max_performances_ref) - ) - log = print_custom_error( - y_train_scaled_back, y_train_pred_scaled_back, X_mnk_train, log, False - ) - log = print_error(y_train_scaled_back, y_train_pred_scaled_back, log) - - if all([x is not None for x in [X_test, X_mnk_test, Y_test]]): - # Test error (scaled-back) - log += print_and_log("\nTesting error (scaled back): (test&val)") - y_test_pred_scaled_back = scale_back( - y_test_pred, X_mnk_test, max_performances_ref - ) - y_test_scaled_back = pd.DataFrame( - scale_back(Y_test.values.flatten(), X_mnk_test, max_performances_ref) - ) - log = print_custom_error( - y_test_scaled_back, y_test_pred_scaled_back, X_mnk_test, log, False - ) - log = print_error(y_test_scaled_back, y_test_pred_scaled_back, log) - - # =============================================================================== - # Print histogram for "best" estimator - if all([x is not None for x in [X_test, X_mnk_test, Y_test]]): - log += print_and_log("\nPlot result histogram:") - plot_loss_histogram(Y_test, y_test_pred, X_mnk_test, folder) - - # =============================================================================== - # Plot prediction accuracy and goodness of choice for a few mnks (training-set) - if all([x is not None for x in [X_train, X_mnk_train, Y_train]]): - n_samples = 10 if data_nrows < 100000000 else 2 - mnks_to_plot = random.sample(X_mnk_train["mnk"].values.tolist(), n_samples) - - from matplotlib.backends.backend_pdf import PdfPages - - plot_file_path = os.path.join(folder, "evaluation_by_mnk_refit.pdf") - if all([x is not None for x in [X_test, X_mnk_test, Y_test]]): - plot_file_path = os.path.join(folder, "evaluation_by_mnk.pdf") - pp = PdfPages(plot_file_path) - - for mnk_string in mnks_to_plot: - # Get performances per mnk - idx_mnk = np.where(X_mnk_train == mnk_string)[0].tolist() - assert len(idx_mnk) > 0, "idx_mnk is empty" - m_, n_, k_ = to_tuple(mnk_string) - y_train_pred_mnk = y_train_pred[idx_mnk] - Y_train_mnk = Y_train.iloc[idx_mnk] - - log += print_and_log("Prediction accuracy plot: " + str(mnk_string)) - - plot_prediction_accuracy( - m_, n_, k_, Y_train_mnk, y_train_pred_mnk, True, pp - ) - - log += print_and_log("Goodness plot: " + str(mnk_string)) - plot_choice_goodness( - m_, - n_, - k_, - baseline_performances_algo, - max_performances_ref, - Y_train["perf_scaled"].iloc[idx_mnk].values, - y_train_pred_mnk, - True, - pp, - ) - - # =============================================================================== - # Plot prediction accuracy for a few mnks (testing-set) - if all([x is not None for x in [X_test, X_mnk_test, Y_test]]): - mnks_to_plot = random.sample(X_mnk_test["mnk"].values.tolist(), n_samples) - for mnk_string in mnks_to_plot: - # Get performances per mnk - idx_mnk = np.where(X_mnk_test == mnk_string)[0].tolist() - assert len(idx_mnk) > 0, "idx_mnk is empty" - m_, n_, k_ = to_tuple(mnk_string) - - log += print_and_log("Prediction accuracy plot: " + str(mnk_string)) - plot_prediction_accuracy( - m_, n_, k_, Y_test.iloc[idx_mnk], y_test_pred[idx_mnk], False, pp - ) - - log += print_and_log("Goodness plot: " + str(mnk_string)) - plot_choice_goodness( - m_, - n_, - k_, - baseline_performances_algo, - max_performances_ref, - Y_test["perf_scaled"].iloc[idx_mnk].values, - y_test_pred[idx_mnk], - False, - pp, - True, - ) - - if all([x is not None for x in [X_train, X_mnk_train, Y_train]]): - pp.close() - - # =============================================================================== - # Scale baseline and max performances - max_performances_algo = dict( - zip( - [to_tuple(mnk_string) for mnk_string in max_performances_algo.keys()], - max_performances_algo.values(), - ) - ) - max_performances_algo_scaled = dict( - zip(max_performances_algo.keys(), [1.0] * len(max_performances_algo)) - ) - baseline_performances_algo = dict( - zip( - [to_tuple(mnk_string) for mnk_string in baseline_performances_algo.keys()], - baseline_performances_algo.values(), - ) - ) - baseline_performances_algo_scaled = dict( - zip( - [(m, n, k) for m, n, k in baseline_performances_algo.keys()], - [ - perf / max_performances_algo[(m, n, k)] - for (m, n, k), perf in baseline_performances_algo.items() - ], - ) - ) - - # =============================================================================== - # Compare max performances and baseline - from matplotlib.backends.backend_pdf import PdfPages - - plot_file_path = os.path.join(folder, "evaluation_by_overall_refit.pdf") - if all([x is not None for x in [X_test, X_mnk_test, Y_test]]): - plot_file_path = os.path.join(folder, "evaluation_overall.pdf") - pp = PdfPages(plot_file_path) - - if all([x is not None for x in [X_test, X_mnk_test, Y_test]]): - plot_performance_gains( - max_performances_algo, - baseline_performances_algo, - "trained", - "max. performance per algorithm", - "baseline per algorithm", - pp, - ) - plot_scaled_performance_gains( - max_performances_algo_scaled, - baseline_performances_algo_scaled, - "trained", - "max. performance per algorithm", - "baseline per algorithm", - pp, - ) - - # =============================================================================== - # 'Results' = y_true ( y_chosen ) - if all([x is not None for x in [X_train, X_mnk_train, Y_train]]): - ( - predictive_model_perf_train, - predictive_model_perf_train_scaled, - ) = get_predive_model_performances( - Y_train, - y_train_pred, - X_mnk_train, - max_performances_ref, - max_performances_algo, - ) - - if all([x is not None for x in [X_test, X_mnk_test, Y_test]]): - ( - predictive_model_perf_test, - predictive_model_perf_test_scaled, - ) = get_predive_model_performances( - Y_test, - y_test_pred, - X_mnk_test, - max_performances_ref, - max_performances_algo, - ) - - # =============================================================================== - # Plot results (training set: predictive modelling VS naïve) - log += print_and_log("\nPredictive model VS baseline: ") - - if all([x is not None for x in [X_train, X_mnk_train, Y_train]]): - perf_gain_pred_train_over_baseline = performance_gain( - baseline_performances_algo, predictive_model_perf_train - ) - plot_absolute_performance_gain( - perf_gain_pred_train_over_baseline, - "trained", - "baseline per algorithm", - "predictive model", - pp, - ) - - scaled_perf_gain_pred_train_over_baseline = performance_gain( - baseline_performances_algo_scaled, predictive_model_perf_train_scaled - ) - plot_relative_performance_gain( - scaled_perf_gain_pred_train_over_baseline, - "trained", - "baseline per algorithm", - "predictive model", - pp, - ) - - if all([x is not None for x in [X_test, X_mnk_test, Y_test]]): - perf_gain_pred_test_over_baseline = performance_gain( - baseline_performances_algo, predictive_model_perf_test - ) - plot_absolute_performance_gain( - perf_gain_pred_test_over_baseline, - "tested", - "baseline per algorithm", - "predictive model", - pp, - ) - - scaled_perf_gain_pred_test_over_baseline = performance_gain( - baseline_performances_algo_scaled, predictive_model_perf_test_scaled - ) - plot_relative_performance_gain( - scaled_perf_gain_pred_test_over_baseline, - "tested", - "baseline per algorithm", - "predictive model", - pp, - ) - - log += print_and_log("\nPredictive model VS autotuned: ") - perf_gain_pred_train_over_max = performance_gain( - max_performances_algo, predictive_model_perf_train - ) - plot_absolute_performance_gain( - perf_gain_pred_train_over_max, - "trained", - "max. performance per algorithm", - "predictive model", - pp, - ) - scaled_perf_gain_pred_train_over_max = performance_gain( - max_performances_algo_scaled, predictive_model_perf_train_scaled - ) - plot_relative_performance_gain( - scaled_perf_gain_pred_train_over_max, - "trained", - "max. performance per algorithm", - "predictive model", - pp, - ) - - if all([x is not None for x in [X_test, X_mnk_test, Y_test]]): - perf_gain_pred_test_over_max = performance_gain( - max_performances_algo, predictive_model_perf_test - ) - plot_absolute_performance_gain( - perf_gain_pred_test_over_max, - "tested", - "max. performance per algorithm", - "predictive model", - pp, - ) - scaled_perf_gain_pred_test_over_max = performance_gain( - max_performances_algo_scaled, predictive_model_perf_test_scaled - ) - plot_relative_performance_gain( - scaled_perf_gain_pred_test_over_max, - "tested", - "max. performance per algorithm", - "predictive model", - pp, - ) - - if all([x is not None for x in [X_test, X_mnk_test, Y_test]]): - log += print_and_log("\nCompare performances: ") - plot_performance_gains( - baseline_performances_algo, - predictive_model_perf_train, - "trained", - "baseline per algorithm", - "predictive model", - pp, - ) - plot_performance_gains( - max_performances_algo, - predictive_model_perf_train, - "trained", - "max. performance per algorithm", - "predictive model", - pp, - ) - - if all([x is not None for x in [X_test, X_mnk_test, Y_test]]): - plot_performance_gains( - baseline_performances_algo, - predictive_model_perf_test, - "tested", - "baseline per algorithm", - "predictive model", - pp, - ) - plot_performance_gains( - max_performances_algo, - predictive_model_perf_test, - "tested", - "max. performance per algorithm", - "predictive model", - pp, - ) - - pp.close() - - return log - - -# =============================================================================== -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=""" - Train a Machine Learning model on autotuning data to predict a kernel's performance given - its template parameters - - - This script is part of the workflow for predictive modelling of optimal libsmm_acc parameters. - For more details, see README.md. - """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument( - "-d", - "--destination_folder", - metavar="FOLDER", - type=str, - default=".", - help="Folder in which to write plots, models, etc.", - ) - parser.add_argument( - "-f", - "--folder", - metavar="FOLDER", - type=str, - default=".", - help="Folder from which to read data", - ) - parser.add_argument( - "-a", "--algo", metavar="algoname", default="", help="Algorithm to train on" - ) - parser.add_argument( - "-m", - "--model", - default="DT", - help="Model to train. Options: DT (Decision Trees), RF (Random Forests), xgb-DT, xgb-DT-dask (out-of-memory" - + "xgboost), xgb-DT-GPU (with GPU support), xgb-RF", - ) - parser.add_argument( - "-o", - "--hyperparameter_optimization", - default=False, - help="Whether to do hyperparameter optimization. If False, the model will be trained with 'best guess' parameters", - ) - parser.add_argument( - "-s", - "--splits", - default=3, - metavar="NUMBER", - type=int, - help="Number of cross-validation splits used in RFECV and GridSearchCV", - ) - parser.add_argument( - "-e", - "--ntrees", - default=3, - metavar="NUMBER", - type=int, - help="Number of estimators in RF", - ) - parser.add_argument( - "-j", - "--njobs", - default=-1, - metavar="NUMBER", - type=int, - help="Number of parallel jobs that Joblib will launch (used by GridSearchCV and XGBoost)", - ) - parser.add_argument( - "-r", - "--nrows", - default=None, - metavar="NUMBER", - type=int, - help="Number of rows of data to load. Default: None (load all)", - ) - parser.add_argument( - "-g", - "--prefitted_model", - metavar="filename", - default="", - help="Path to pickled model object to load instead of re-training model", - ) - parser.add_argument( - "-i", - "--intermediate_evaluation", - default=False, - help="Whether to perform evaluation of the model trained on part of the model", - ) - parser.set_defaults(intermediate_evaluation=False) - - args = parser.parse_args() - model_args = { - "model": args.model, - "splits": args.splits, - "ntrees": args.ntrees, - "njobs": args.njobs, - "hyperparameter_optimization": args.hyperparameter_optimization, - } - main( - args.folder, - args.destination_folder, - args.algo, - model_args, - args.nrows, - args.prefitted_model, - args.intermediate_evaluation, - ) diff --git a/src/acc/libsmm_acc/predict/prepare_training_data.py b/src/acc/libsmm_acc/predict/prepare_training_data.py deleted file mode 100755 index d8240d9e2d4..00000000000 --- a/src/acc/libsmm_acc/predict/prepare_training_data.py +++ /dev/null @@ -1,832 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -#################################################################################################### -# Copyright (C) by the DBCSR developers group - All rights reserved # -# This file is part of the DBCSR library. # -# # -# For information on the license, see the LICENSE file. # -# For further information please visit https://dbcsr.cp2k.org # -# SPDX-License-Identifier: GPL-2.0+ # -#################################################################################################### - - -import sys -import os -import json -import argparse -import numpy as np -import pandas as pd -import dask.dataframe as dd -from joblib import Parallel, delayed -from tqdm import tqdm - -sys.path.append("../") - -from kernels.smm_acc import kernel_algorithm, mnk_pattern # noqa: E402 -from kernels.smm_acc_predict import ( # noqa: E402 - PredictiveParameters, - derived_parameters, -) - - -# =============================================================================== -# HELPER FUNCTIONS -# =============================================================================== -def update_maximums(dictionnary_to_update, dictionnary_partial): - for mnk, new_perf in dictionnary_partial.items(): - if mnk in dictionnary_to_update.keys(): - if new_perf > dictionnary_to_update[mnk]: - dictionnary_to_update[mnk] = new_perf - else: - dictionnary_to_update[mnk] = new_perf - return dictionnary_to_update - - -# =============================================================================== -def get_idx_baseline(data_mnk, algorithm, baseline_pars): - if algorithm in ["tiny"]: - idx_baseline = data_mnk[ - (data_mnk.m == baseline_pars["m"]) - & (data_mnk.n == baseline_pars["n"]) - & (data_mnk.k == baseline_pars["k"]) - & (data_mnk.threads == baseline_pars["threads"]) - & (data_mnk.grouping == baseline_pars["grouping"]) - & (data_mnk.minblocks == baseline_pars["minblocks"]) - ].index.tolist() - elif algorithm in ["small", "medium"]: - idx_baseline = data_mnk[ - (data_mnk.m == baseline_pars["m"]) - & (data_mnk.n == baseline_pars["n"]) - & (data_mnk.k == baseline_pars["k"]) - & (data_mnk.threads == baseline_pars["threads"]) - & (data_mnk.grouping == baseline_pars["grouping"]) - & (data_mnk.minblocks == baseline_pars["minblocks"]) - & (data_mnk.tile_m == baseline_pars["tile_m"]) - & (data_mnk.tile_n == baseline_pars["tile_n"]) - ].index.tolist() - else: # algorithm is largeDB1 or largeDB2 - idx_baseline = data_mnk[ - (data_mnk.m == baseline_pars["m"]) - & (data_mnk.n == baseline_pars["n"]) - & (data_mnk.k == baseline_pars["k"]) - & (data_mnk.threads == baseline_pars["threads"]) - & (data_mnk.minblocks == baseline_pars["minblocks"]) - & (data_mnk.tile_m == baseline_pars["tile_m"]) - & (data_mnk.tile_n == baseline_pars["tile_n"]) - & (data_mnk.w == baseline_pars["w"]) - & (data_mnk.v == baseline_pars["v"]) - ].index.tolist() - return idx_baseline - - -def get_performance_closest_to_baseline( - data, algorithm, mnk, gpu_properties, autotuning_properties -): - """ - Sometimes, the so-called "baseline" parameter set does not appear in the training data. - This function finds the performance of the parameter set from the training data whose parameters are closest to those of the - baseline parameter sets. - """ - m, n, k = mnk_pattern.match(mnk).groups() - m, n, k = int(m), int(n), int(k) - - data_mnk = data[(data["m"] == m) & (data["n"] == n) & (data["k"] == k)].compute() - baseline_pars = kernel_algorithm[algorithm].baseline( - m, n, k, gpu_properties, autotuning_properties - ) - - # Get performance of baseline parameters for this algorithm & this mnk: - idx_baseline = get_idx_baseline(data_mnk, algorithm, baseline_pars) - - # Get performance of baseline parameters for this algorithm & this mnk: - if len(idx_baseline) == 0: - # Generate space of possibilities - pars_sets = kernel_algorithm[algorithm].promising_parameters( - m, n, k, gpu_properties, autotuning_properties - ) - # Sort space by distance to baseline set - pars_sets.sort( - key=lambda x: kernel_algorithm[algorithm].parameter_set_distance( - x, baseline_pars - ) - ) - - for pars_set in pars_sets: - idx_baseline = get_idx_baseline(data_mnk, algorithm, pars_set) - if len(idx_baseline) > 0: - break - else: - raise AssertionError( - f'Could not find closest baseline for mnk=({m}x{n}x{k}) and for algorithm "{algorithm}.\n' - f"Last baseline parameters searched:\n{baseline_pars}\n" - f"Parameter sets searched:\n" - ) - - idx_baseline = idx_baseline[0] - baseline_perf = data_mnk["perf (Gflop/s)"][idx_baseline] - return round(baseline_perf, 3) - - -def process_chunk(data_chunk, algorithm, gpu_properties, autotuning_properties): - """ - Given a chunk of data, compute the baseline and maximum performance of the (m, n, k)-triplets featured in the chunk of data. - """ - # Add "mnk" column - data_chunk["mnk"] = ( - data_chunk["m"].astype(str) - + "x" - + data_chunk["n"].astype(str) - + "x" - + data_chunk["k"].astype(str) - ) - # Get mnks - mnks = data_chunk["mnk"].unique() - - # For each (mnk), ... - baseline_performances = dict() - max_performances = dict() - for mnk in mnks: - data_mnk = data_chunk[data_chunk["mnk"] == mnk] - m, n, k = mnk_pattern.match(mnk).groups() - m, n, k = int(m), int(n), int(k) - - # Get baseline configuration for this algorithm & this mnk: - baseline_pars = kernel_algorithm[algorithm].baseline( - m, n, k, gpu_properties, autotuning_properties - ) - - # Get performance of baseline parameters for this algorithm & this mnk: - idx_baseline = get_idx_baseline(data_mnk, algorithm, baseline_pars) - if len(idx_baseline) < 1: - baseline_perf = 0 - else: - idx_baseline = idx_baseline[0] - baseline_perf = data_mnk["perf (Gflop/s)"][idx_baseline] - - baseline_performances[mnk] = round(baseline_perf, 3) - - # Get max performance for this algorithm & this mnk - max_perf = data_mnk["perf (Gflop/s)"].max() - max_performances[mnk] = round(max_perf, 3) - - return baseline_performances, max_performances - - -# =============================================================================== -def write_to_parquet(data_path, algorithm): - """ - Compress CSV files to parquet - """ - # Check whether the files corresponding to this algorithm have been compressed to parquet already - parquet_file = os.path.join(data_path, "training_data_" + algorithm + ".parquet") - parquet_file_done = os.path.join( - data_path, "training_data_" + algorithm + ".parquet.done" - ) - print( - "\n\n------------------------------------------------------------------------" - ) - if os.path.exists(parquet_file_done): - print("Found {:40}, skipping".format(parquet_file_done)) - - else: - print("Didn't find {:40}, generating".format(parquet_file_done)) - - # [RAW] Read CSV files into Pandas dataframes - data_file_raw = os.path.join( - data_path, "raw_training_data_" + algorithm + ".csv" - ) - print("\nRead raw data from: {}".format(data_file_raw)) - data_raw = dd.read_csv(data_file_raw) - raw_data_nrows = len(data_raw) - # n_partitions should be > 1 ! - n_partitions = max(1, int(raw_data_nrows // 1e5)) - data_raw = data_raw.repartition(npartitions=n_partitions) - data_raw = data_raw.reset_index().set_index("index") - data_raw["idx"] = 1 - data_raw["idx"] = data_raw.idx.cumsum() - data_raw = data_raw.set_index("idx", sorted=True) - print("Raw data head:\n", data_raw.head()) - - # [DERIVED] Read CSV files into Pandas dataframes - data_file_derived = os.path.join( - data_path, "training_data_" + algorithm + ".csv" - ) - print("\nRead derived data from: {}".format(data_file_derived)) - data_derived = dd.read_csv(data_file_derived) - derived_data_nrows = len(data_derived) - data_derived = data_derived.repartition(npartitions=n_partitions) - data_derived = data_derived.reset_index().set_index("index") - data_derived["idx"] = 1 - data_derived["idx"] = data_derived.idx.cumsum() - data_derived = data_derived.set_index("idx", sorted=True) - print("Derived data head:\n", data_derived.head()) - - # Merge raw/derived data together - print("Merging raw and derived ...") - data = dd.merge(data_raw, data_derived, left_index=True, right_index=True) - - len_data, len_data_raw, len_data_derived = ( - len(data), - raw_data_nrows, - derived_data_nrows, - ) - nrows_message_temp = """ - Data 1 : {:15,}, - Data 2 : {:15,}, - Merged data: {:15,}""" - nrows_message = nrows_message_temp.format( - len_data_raw, len_data_derived, len_data - ) - assert len_data == len_data_raw, "Mismatch in number of rows\n" + nrows_message - assert len_data == len_data_derived, ( - "Mismatch in number of rows\n" + nrows_message - ) - - # Add "mnk" column - data["mnk"] = ( - data["m"].astype(str) - + "x" - + data["n"].astype(str) - + "x" - + data["k"].astype(str) - ) - - # Print info on merged dataset - print("\nMerged data head:", data.head()) - data_nrows = len(data) - nrows_message = """ -Data : {:15,}, -Raw data : {:15,}, -Derived data: {:15,}""".format( - data_nrows, raw_data_nrows, derived_data_nrows - ) - assert data_nrows == raw_data_nrows, ( - "Mismatch in number of rows\n" + nrows_message - ) - assert data_nrows == derived_data_nrows, ( - "Mismatch in number of rows\n" + nrows_message - ) - print(nrows_message) - - # Compress files to Parquet - print("Compress and write to {}".format(parquet_file)) - data.to_parquet(parquet_file, engine="fastparquet", compression="snappy") - open( - parquet_file_done, "w" - ).close() # touch a file to mark that parquet is done - - -# =============================================================================== -def get_non_null(nlist): - """ - Given a list of numbers, return its first positive element, if it exists, zero otherwise. - """ - for e in nlist: - if e > 0: - return e - return 0 - - -def get_max(nlist): - """ - Return the largest element of a list of numbers - """ - return np.array(nlist).max() - - -def list_of_dics_to_dic_of_lists(list_of_dics): - """ - Given a list "list_of_dics" of dictionaries "d", with keys "k" and values "v", - construct a dictionary with keys "k" and values which are lists "[v1, v2, ...]" - of the values corresponding to "k" in the various dictionaries "d" - """ - dic_of_lists = dict() - for dic in list_of_dics: - for k, v in dic.items(): - if k not in dic_of_lists.keys(): - dic_of_lists[k] = list() - dic_of_lists[k].append(v) - return dic_of_lists - - -def dic_of_dics_to_dic_of_lists(dic_of_dics): - dic_of_lists = dict() - for _, dic in dic_of_dics.items(): - for k, v in dic.items(): - if k not in dic_of_lists.keys(): - dic_of_lists[k] = list() - dic_of_lists[k].append(v) - return dic_of_lists - - -def write_baseline_and_max_records_per_algorithm( - data_path, algorithm, arch, n_jobs, chunk_size -): - """ - Write records of baseline performances and maximum performances for the training mnks. - This function reads from the raw data file (`raw_training_data_ALGORITHM.csv`) - Writes to JSON files. - """ - # Read GPU properties and autotuning properties - with open("../kernels/gpu_properties.json") as f: - gpu_properties = json.load(f)[arch] - with open("../kernels/autotuning_properties.json") as f: - autotuning_properties = json.load(f) - - # Check whether record of baseline exists - baseline_performances_per_algo_file = os.path.join( - data_path, "baseline_performances_" + algorithm + ".json" - ) - max_performances_per_algo_file = os.path.join( - data_path, "max_performances_" + algorithm + ".json" - ) - print( - "\n\n------------------------------------------------------------------------" - ) - if os.path.exists(baseline_performances_per_algo_file) and os.path.exists( - max_performances_per_algo_file - ): - print("Found {:40}, skipping".format(baseline_performances_per_algo_file)) - print("Found {:40}, skipping".format(max_performances_per_algo_file)) - - else: - print("Processing data of algorithm {}".format(algorithm)) - raw_pars_cols = kernel_algorithm[algorithm].launch_parameters - if algorithm in ["largeDB1", "largeDB2"]: - raw_pars_cols.remove("grouping") - - data_file_raw = os.path.join( - data_path, "raw_training_data_" + algorithm + ".csv" - ) - baseline_and_maximums_performance_dictionaries = Parallel( - n_jobs=n_jobs, verbose=1 - )( - delayed(process_chunk, check_pickle=True)( - data_chunk, algorithm, gpu_properties, autotuning_properties - ) - for data_chunk in tqdm( - pd.read_csv(data_file_raw, chunksize=chunk_size), disable=True - ) - ) - - baseline_performance_dictionaries, maximums_performance_dictionaries = zip( - *baseline_and_maximums_performance_dictionaries - ) - baseline_performance_dictionary = list_of_dics_to_dic_of_lists( - baseline_performance_dictionaries - ) - assert ( - 0 not in baseline_performance_dictionary.values() - ), "Found a max. performance of 0" - maximums_performance_dictionary = list_of_dics_to_dic_of_lists( - maximums_performance_dictionaries - ) - assert ( - 0 not in maximums_performance_dictionary.values() - ), "Found a baseline performance of 0" - - # Write max performances to files - max_performances = dict() - print("\nComputing maximum performances ...") - for mnk, max_list in maximums_performance_dictionary.items(): - perf = get_max(max_list) - max_performances[mnk] = perf - with open(max_performances_per_algo_file, "w") as f: - json.dump(max_performances, f, indent="\t", sort_keys=True) - print("Wrote maximum performances to:\n", max_performances_per_algo_file) - - # Write baseline performances to files - baseline_performances = dict() - - def get_baseline_performance(mnk, base_list, raw_pars_cols): - perf = get_non_null(base_list) - if perf == 0: - data_file = os.path.join( - data_path, "raw_training_data_" + algorithm + ".csv" - ) - data = dd.read_csv(data_file) - perf = get_performance_closest_to_baseline( - data, algorithm, mnk, gpu_properties, autotuning_properties - ) - return perf - - print("\nComputing baseline performances ...") - baseline_performances_ = Parallel(n_jobs=n_jobs, verbose=1)( - delayed(get_baseline_performance, check_pickle=True)( - mnk, base_list, raw_pars_cols - ) - for mnk, base_list in tqdm( - baseline_performance_dictionary.items(), disable=True - ) - ) - - baseline_performances = dict( - zip(baseline_performance_dictionary.keys(), baseline_performances_) - ) - with open(baseline_performances_per_algo_file, "w") as f: - json.dump(baseline_performances, f, indent="\t", sort_keys=True) - print("Wrote baseline performances to:\n", baseline_performances_per_algo_file) - - -# =============================================================================== -def plot_baseline(baseline_perfs_by_algo, data_path, algorithms): - import re - import matplotlib.pyplot as plt - - print("\nPlotting baseline performances ...") - - # Get all mnks - mnk_sequences = list() - for _algo, baseline_dic in baseline_perfs_by_algo.items(): - mnk_sequences += list(baseline_dic.keys()) - all_mnks = list(set.union(set(mnk_sequences))) - - # Reduce baseline_perfs_by_algo to baseline_perfs - baseline_perfs = dict() - for mnk in all_mnks: - for algo in [ - "medium", - "small", - "largeDB1", - "largeDB2", - "tiny", - ]: # algorithms in order of baseline-ness - if mnk in baseline_perfs_by_algo[algo].keys(): - baseline_perfs[mnk] = baseline_perfs_by_algo[algo][mnk] - break - else: - raise AssertionError( - "NOOOO this is actually impossible by def of all_mnks, isn't it?" - ) - - # Sort - mnks = list() - mnk_str = re.compile(r"(\d+)x(\d+)x(\d+)") - for mnk_s in baseline_perfs.keys(): - match = mnk_str.match(mnk_s) - mnks.append((int(match.group(1)), int(match.group(2)), int(match.group(3)))) - - baseline_performances = zip(mnks, baseline_perfs.values()) - - baseline_performances_sorted = [ - (mnk[0] * mnk[1] * mnk[2], p) - for mnk, p in sorted( - baseline_performances, key=lambda x: x[0][0] * x[0][1] * x[0][2] - ) - ] - mnk_sorted, baseline_perf_sorted = list(zip(*baseline_performances_sorted)) - - # Plot - plt.plot(mnk_sorted, baseline_perf_sorted, ".", markersize=1) - plt.xlabel("(m, n, k) triplets of training data (in order of increasing m*n*k)") - plt.ylabel("Baseline performances (Gflop/s)") - plt.title("Baseline performances on training data") - algorithm_extension = "_" + algorithms[0] if len(algorithms) == 0 else "" - file_name = os.path.join( - data_path, "baseline_performances" + algorithm_extension + ".svg" - ) - plt.savefig(file_name) - print("... wrote to", file_name) - plt.close() - - -def write_baseline_record(data_path, algorithms): - baseline_performances_by_algo_file = os.path.join( - data_path, "baseline_performances_by_algo.json" - ) - if os.path.exists(baseline_performances_by_algo_file): - print("Found {:40}, skipping".format(baseline_performances_by_algo_file)) - with open(baseline_performances_by_algo_file) as f: - baseline_performances_by_algo = json.load(f) - - else: - print( - "File {:40} not found, generating".format( - baseline_performances_by_algo_file - ) - ) - # Get baseline performances by algorithm - baseline_performances_by_algo = dict() - for algorithm in algorithms: - # Read baseline parameters - baseline_performances_per_algo_file = os.path.join( - data_path, "baseline_performances_" + algorithm + ".json" - ) - with open(baseline_performances_per_algo_file, "r") as f: - baseline_algorithm = json.load(f) - # Add to dictionary - baseline_performances_by_algo[algorithm] = baseline_algorithm - - # Write to file - with open(baseline_performances_by_algo_file, "w") as f: - json.dump(baseline_performances_by_algo, f, indent="\t", sort_keys=True) - print("\nWrote baseline performances to:\n", baseline_performances_by_algo_file) - - plot_baseline(baseline_performances_by_algo, data_path, algorithms) - - -def write_max_by_algo_record(data_path, algorithms): - max_performances_by_algo_file = os.path.join( - data_path, "max_performances_by_algo.json" - ) - if os.path.exists(max_performances_by_algo_file): - print("Found {:40}, skipping".format(max_performances_by_algo_file)) - - else: - # Get max performances by algorithm - max_performances_by_algo = dict() - for algorithm in algorithms: - # Read max parameters - max_performances_per_algo_file = os.path.join( - data_path, "max_performances_" + algorithm + ".json" - ) - with open(max_performances_per_algo_file, "r") as f: - max_algorithm = json.load(f) - # Add to dictionary - max_performances_by_algo[algorithm] = max_algorithm - - # Write to file - with open(max_performances_by_algo_file, "w") as f: - json.dump(max_performances_by_algo, f, indent="\t", sort_keys=True) - print( - "\nWrote max performances by algorithm to:\n", max_performances_by_algo_file - ) - - -def plot_max_performances(max_perfs, data_path, algorithms): - import re - import matplotlib.pyplot as plt - - print("\nPlotting max. performances ...") - - mnks = list() - mnk_str = re.compile(r"(\d+)x(\d+)x(\d+)") - for mnk_s in max_perfs.keys(): - match = mnk_str.match(mnk_s) - mnks.append((int(match.group(1)), int(match.group(2)), int(match.group(3)))) - - max_performances = zip(mnks, max_perfs.values()) - max_performances_sorted = [ - (mnk[0] * mnk[1] * mnk[2], p) - for mnk, p in sorted( - max_performances, key=lambda x: x[0][0] * x[0][1] * x[0][2] - ) - ] - mnk_sorted, max_perf_sorted = list(zip(*max_performances_sorted)) - - # Plot - plt.plot(mnk_sorted, max_performances_sorted, ".", markersize=1) - plt.xlabel("(m, n, k) triplets of training data (in order of increasing m*n*k)") - plt.ylabel("Max. performances (Gflop/s)") - plt.title("Maximum performances on training data") - algorithm_extension = "_" + algorithms[0] if len(algorithms) == 0 else "" - file_name = os.path.join( - data_path, "max_performances" + algorithm_extension + ".svg" - ) - plt.savefig(file_name) - print("... wrote to", file_name) - plt.close() - - -def write_max_record(data_path, algorithms): - max_performances_file = os.path.join(data_path, "max_performances.json") - if os.path.exists(max_performances_file): - print("Found {:40}, skipping".format(max_performances_file)) - with open(max_performances_file) as f: - max_performances = json.load(f) - - else: - # Get max performances - max_performances_by_algo = dict() - for algorithm in algorithms: - # Read max parameters - max_performances_per_algo_file = os.path.join( - data_path, "max_performances_" + algorithm + ".json" - ) - with open(max_performances_per_algo_file, "r") as f: - max_algorithm = json.load(f) - # Add to dictionary - max_performances_by_algo[algorithm] = max_algorithm - - # Reduce along max - max_performances_list = dic_of_dics_to_dic_of_lists(max_performances_by_algo) - max_performances = dict() - for mnk, max_list in max_performances_list.items(): - max_performances[mnk] = get_max(max_list) - - # Write to file - with open(max_performances_file, "w") as f: - json.dump(max_performances, f, indent="\t", sort_keys=True) - print("\nWrote max performances to:\n", max_performances_file) - - plot_max_performances(max_performances, data_path, algorithms) - - -def get_derived_pars( - data_path, - i, - data_chunk, - algorithm, - gpu_properties, - autotuning_properties, - max_performances, -): - # Compute derived parameters - data_chunk["algorithm"] = [algorithm] * len( - data_chunk.index - ) # add 'algorithm' column manually - parameter_sets = PredictiveParameters( - data_chunk, gpu_properties, autotuning_properties, max_performances - ) - pars_to_get = derived_parameters["common"] + derived_parameters[algorithm] - new_data = parameter_sets.get_features(pars_to_get) - - # Write to CSV - filename = os.path.join(data_path, "training_data_{}-{}.csv".format(algorithm, i)) - new_data.to_csv(filename, index=False) - - return filename - - -def write_derived_data(data_path, algorithm, arch, n_jobs, chunk_size): - """ - The predictive modelling procedure uses not only the raw parameters as features, but also some - "derived" features computed using algorithm characteristics and hardware knowledge. - This function reads raw parameters from `data_path`, computes derived parameters and writes them - to the same folder. - """ - derived_training_data_filename = os.path.join( - data_path, "training_data_{}.csv".format(algorithm) - ) - print( - "\n\n------------------------------------------------------------------------" - ) - if os.path.exists(derived_training_data_filename): - print("Found {:40}, skipping".format(derived_training_data_filename)) - - else: - print("Didn't find {:40}, generating".format(derived_training_data_filename)) - - # Read max performances, GPU properties and autotuning properties - maxperf_file = os.path.join(data_path, "max_performances.json") - with open(maxperf_file) as f: - max_performances = json.load(f) - with open("../kernels/gpu_properties.json") as f: - gpu_properties = json.load(f)[arch] - with open("../kernels/autotuning_properties.json") as f: - autotuning_properties = json.load(f) - - # Compute derived data from raw data - raw_training_data_filename = os.path.join( - data_path, "raw_training_data_{}.csv".format(algorithm) - ) - print( - "reading raw data from {} and computing derived parameters".format( - raw_training_data_filename - ) - ) - - derived_training_data_filenames = Parallel(n_jobs=n_jobs, verbose=1)( - delayed(get_derived_pars, check_pickle=True)( - data_path, - i, - data_chunk, - algorithm, - gpu_properties, - autotuning_properties, - max_performances, - ) - for i, data_chunk in enumerate( - pd.read_csv(raw_training_data_filename, chunksize=chunk_size) - ) - ) - - # Merge the CSV files (one for each iteration of the above Joblib loop) into one file - assert len(derived_training_data_filenames) > 0, "No training data files" - if len(derived_training_data_filenames) == 1: - # No merging is necessary. Simply rename the file - os.rename( - derived_training_data_filenames[0], derived_training_data_filename - ) - - else: - with open(derived_training_data_filename, "w") as out: - # Write the first file, including its header - fn_1 = derived_training_data_filenames.pop(0) - with open(fn_1) as f: - out.write(f.read()) - os.remove(fn_1) - # Write the rest of the files, skipping the header line each time - for i, fn in enumerate(derived_training_data_filenames): - print( - "writing from {} ({}/{})".format( - fn, i + 1, len(derived_training_data_filenames) - ) - ) - with open(fn) as f: - next(f) # skip header line - out.write(f.read()) - # Delete the file we just merged - os.remove(fn) - - print("\tWrote", derived_training_data_filename) - - -# =============================================================================== -def main(data_path, algorithms_to_prep, arch, n_jobs, chunk_size, skip_derived_data): - # =============================================================================== - # Write baseline and maximum performance records - for algorithm in algorithms_to_prep: - write_baseline_and_max_records_per_algorithm( - data_path, algorithm, arch, n_jobs, chunk_size - ) - - if set(algorithms_to_prep) == set(kernel_algorithm.keys()): - write_baseline_record(data_path, algorithms_to_prep) - write_max_by_algo_record(data_path, algorithms_to_prep) - write_max_record(data_path, algorithms_to_prep) - - # =============================================================================== - if not skip_derived_data: - for algorithm in algorithms_to_prep: - write_derived_data(data_path, algorithm, arch, n_jobs, chunk_size) - write_to_parquet(data_path, algorithm) - - -# =============================================================================== -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description=""" - Prepare the data collected with autotuning for training, - After downloading raw data from the dedicated repository, use this script to - - Record maximum and baseline performances of (m,n,k)-triplets in JSON files - - Compute derived training data and write it to a CSV file - - Compress training data csv files to parquet file format - - - This script is part of the workflow for predictive modelling of optimal libcusmm parameters. - For more details, see predict.md - """, - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument( - "-f", - "--folder", - metavar="FOLDER", - type=str, - default=".", - help="Path to the data to be converted to parquet.", - ) - parser.add_argument( - "-l", - "--algorithm", - metavar="ALGORITHM", - default="", - help="Algorithms to prepare", - ) - parser.add_argument( - "-a", - "--arch", - metavar="ARCHITECTURE", - type=str, - default="sm_80", - help="CUDA architecture number. Options: sm_35, sm_37, sm_60, sm_70, sm_80, gfx906", - ) - parser.add_argument( - "-j", - "--njobs", - default=-1, - metavar="NUMBER", - type=int, - help="Number of parallel jobs that Joblib will launch. If you run into out-of-memory errors, reduce this.", - ) - parser.add_argument( - "-c", - "--chunk_size", - type=int, - default=20000, - help="Chunk size for dispatching joblib jobs. If memory errors are experienced, reduce this number", - ) - parser.add_argument( - "-s", - "--skip_derived_data", - type=bool, - default=False, - help=( - "Skip the computation of derived data. Set to true if computing baseline & max records for " - "each algorithm separately" - ), - ) - - args = parser.parse_args() - algorithms_to_prep = ( - kernel_algorithm.keys() if args.algorithm == "" else [args.algorithm] - ) - main( - args.folder, - algorithms_to_prep, - args.arch, - args.njobs, - args.chunk_size, - args.skip_derived_data, - ) diff --git a/src/acc/libsmm_acc/predict/requirements.txt b/src/acc/libsmm_acc/predict/requirements.txt deleted file mode 100644 index a9187ccbc03..00000000000 --- a/src/acc/libsmm_acc/predict/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -eli5>=0.8.1 -joblib>=0.13.1, <=0.17.0 -tqdm>=4.32.2 -matplotlib>=3.0.2 -numpy>=1.16.0 -pandas>=0.23.4 -scikit-learn>=0.20.2 -dask[dataframe]>=2021.10.0 -xgboost>=0.90 -fastparquet>=0.3.1 -python-snappy>=0.5.4 diff --git a/src/acc/libsmm_acc/tune/README.md b/src/acc/libsmm_acc/tune/README.md index 01b00710f83..96c8571e12e 100644 --- a/src/acc/libsmm_acc/tune/README.md +++ b/src/acc/libsmm_acc/tune/README.md @@ -65,7 +65,6 @@ The `tune_setup.py` script generates job files. You have to adapt the script to output += "date\n" ... - ... ``` @@ -235,11 +234,7 @@ Wrote parameters.new.json The file `parameters.new.json` can now be used as a parameter file. Rename it to `parameters_GPU.json`, with the appropriate `GPU`. -#### 8. (optional) Explore the data - -Explore the data interactively using the [provided Jupyter Notebook](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/notebooks/inspect_training_data.ipynb). - -#### 9. Contribute parameters to the community +#### 8. Contribute parameters to the community **Contribute new optimal parameters** diff --git a/src/acc/opencl/Makefile b/src/acc/opencl/Makefile index 0cbefd97d69..244a7b2692f 100644 --- a/src/acc/opencl/Makefile +++ b/src/acc/opencl/Makefile @@ -73,23 +73,15 @@ ifneq (,$(ELEM_TYPE)) CFLAGS += -DELEM_TYPE=$(ELEM_TYPE) endif -ifeq (1,$(INTEL)) - CXX := icpc - CC := icc - AR := xiar -else ifneq (0,$(INTEL)) - CXX := icpx - CC := icx - AR := xiar -else ifneq (0,$(GNU)) - override CXX := g++ - override CC := gcc - ifneq (Darwin,$(UNAME)) - override AR := gcc-ar +ifneq (0,$(INTEL)) + ifneq (1,$(INTEL)) + CXX := icpx + CC := icx else - override AR := ar + CXX := icpc + CC := icc endif - #override LD_LIBRARY_DIRS := $(NULL) + AR := $(if $(call which,xiar),xiar,ar) else CXX := g++ CC := gcc @@ -109,13 +101,14 @@ ifneq (0,$(DEV)) CFLAGS += -D__DBCSR_ACC CFLAGS += -Wno-deprecated -Werror ifneq (2,$(DEV)) + $(info DEBUG: $(CC) $(CXX)) ifneq (,$(findstring clang,$(CC) $(CXX))) override CC := clang++ --analyze else - CC := $(CXX) -xc++ + override CC := $(CXX) -xc++ endif else - CC := $(CXX) -xc++ + override CC := $(CXX) -xc++ endif $(info CC: $(shell $(CC) --version | head -n1)) OMP := 0 diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c index 6ea4f1d1f83..d7ee7651184 100644 --- a/src/acc/opencl/acc_opencl.c +++ b/src/acc/opencl/acc_opencl.c @@ -217,8 +217,8 @@ int c_dbcsr_acc_init(void) { cl_platform_id platforms[ACC_OPENCL_MAXNDEVS] = {NULL}; cl_device_id devices[ACC_OPENCL_MAXNDEVS]; char buffer[ACC_OPENCL_BUFFERSIZE]; + const char *const env_devsplit = getenv("ACC_OPENCL_DEVSPLIT"), *const env_priority = getenv("ACC_OPENCL_PRIORITY"); const char *const env_devmatch = getenv("ACC_OPENCL_DEVMATCH"), *const env_devtype = getenv("ACC_OPENCL_DEVTYPE"); - const char *const env_priority = getenv("ACC_OPENCL_PRIORITY"), *const env_xhints = getenv("ACC_OPENCL_XHINTS"); const char *const env_verbose = getenv("ACC_OPENCL_VERBOSE"), *const env_debug = getenv("ACC_OPENCL_DEBUG"); const char *const env_device = getenv("ACC_OPENCL_DEVICE"), *const env_dump_acc = getenv("ACC_OPENCL_DUMP"); const char *const env_timer = getenv("ACC_OPENCL_TIMER"), *const env_nlocks = getenv("ACC_OPENCL_NLOCKS"); @@ -229,14 +229,20 @@ int c_dbcsr_acc_init(void) { # endif const char *const env_neo = getenv("NEOReadDebugKeys"), *const env_wa = getenv("ACC_OPENCL_WA"); const int neo = (NULL == env_neo ? 1 : atoi(env_neo)); +# if defined(ACC_OPENCL_XHINTS) + const char* const env_xhints = (ACC_OPENCL_XHINTS); + const int xhints_default = 1 + 2 + 4 + 8; +# else + const char* const env_xhints = NULL; + const int xhints_default = 0; +# endif # if defined(ACC_OPENCL_ASYNC) const char* const env_async = (ACC_OPENCL_ASYNC); - const int async_default = 3; + const int async_default = 1 + 2; # else const char* const env_async = NULL; const int async_default = 0; # endif - const char* const env_devsplit = getenv("ACC_OPENCL_DEVSPLIT"); /*const char* const env_nranks = getenv("MPI_LOCALNRANKS"); const cl_uint nranks = LIBXSMM_MAX(NULL != env_nranks ? atoi(env_nranks) : 1, 1);*/ const cl_int devsplit = (NULL == env_devsplit ? /*(1 < nranks ? -1 : 0)*/ 0 : atoi(env_devsplit)); @@ -274,11 +280,11 @@ int c_dbcsr_acc_init(void) { : c_dbcsr_acc_opencl_config.lock_main); c_dbcsr_acc_opencl_config.verbosity = (NULL == env_verbose ? 0 : atoi(env_verbose)); c_dbcsr_acc_opencl_config.priority = (NULL == env_priority ? /*default*/ 3 : atoi(env_priority)); - c_dbcsr_acc_opencl_config.xhints = (NULL == env_xhints ? (1 + 2) : atoi(env_xhints)); + c_dbcsr_acc_opencl_config.xhints = (NULL == env_xhints ? xhints_default : atoi(env_xhints)); c_dbcsr_acc_opencl_config.async = (NULL == env_async ? async_default : atoi(env_async)); c_dbcsr_acc_opencl_config.dump = (NULL == env_dump ? /*default*/ 0 : atoi(env_dump)); c_dbcsr_acc_opencl_config.debug = (NULL == env_debug ? c_dbcsr_acc_opencl_config.dump : atoi(env_debug)); - c_dbcsr_acc_opencl_config.wa = neo * (NULL == env_wa ? ((1 != devsplit ? 0 : 4) + (8 + 16) + (32 + 64)) : atoi(env_wa)); + c_dbcsr_acc_opencl_config.wa = neo * (NULL == env_wa ? ((1 != devsplit ? 0 : 4) + (8 + 16 + 32)) : atoi(env_wa)); assert(EXIT_SUCCESS == result); if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(NULL /*device*/, env_devmatch, &c_dbcsr_acc_opencl_config.devmatch)) { c_dbcsr_acc_opencl_config.devmatch = 1; @@ -338,18 +344,22 @@ int c_dbcsr_acc_init(void) { # if defined(ACC_OPENCL_NCCS) if ((1 & c_dbcsr_acc_opencl_config.wa) && 0 != nccs && NULL == getenv("ZEX_NUMBER_OF_CCS")) { static char zex_nccs[ACC_OPENCL_MAXNDEVS * 8 + 32] = "ZEX_NUMBER_OF_CCS="; + const int mode = ((1 == nccs || 2 == nccs) ? nccs : 4); int j = strlen(zex_nccs); for (i = 0; i < ACC_OPENCL_MAXNDEVS; ++i) { const char* const istr = (0 < i ? ",%u:%i" : "%u:%i"); - const int n = LIBXSMM_SNPRINTF(zex_nccs + j, sizeof(zex_nccs) - j, istr, i, LIBXSMM_CLMP(nccs, 1, 4)); + const int n = LIBXSMM_SNPRINTF(zex_nccs + j, sizeof(zex_nccs) - j, istr, i, mode); if (0 < n) j += n; else { j = 0; break; } } - /* environment is populated before touching the compute runtime */ - if (0 < j) ACC_OPENCL_EXPECT(0 == LIBXSMM_PUTENV(zex_nccs)); /* soft-error */ + if (0 < j && 0 == LIBXSMM_PUTENV(zex_nccs) && /* populate before touching the compute runtime */ + (2 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity)) + { + fprintf(stderr, "INFO ACC/OpenCL: support multiple separate compute command streamers (%i-CCS mode)\n", mode); + } } assert(EXIT_SUCCESS == result); # endif @@ -373,9 +383,9 @@ int c_dbcsr_acc_init(void) { } assert(EXIT_SUCCESS == result); # endif - if (~(1 + 2) & c_dbcsr_acc_opencl_config.wa) { /* environment is populated before touching the compute runtime */ - static char* key_value[] = {"NEOReadDebugKeys=1", "ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE", "EnableRecoverablePageFaults=0", - "DirectSubmissionOverrideBlitterSupport=0"}; + if (~(1 + 2 + 32) & c_dbcsr_acc_opencl_config.wa) { /* environment is populated before touching the compute runtime */ + static char a[] = "NEOReadDebugKeys=1", b[] = "ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE", c[] = "EnableRecoverablePageFaults=0"; + static char d[] = "DirectSubmissionOverrideBlitterSupport=0", *key_value[] = {a, b, c, d}; if (NULL == env_neo) ACC_OPENCL_EXPECT(0 == LIBXSMM_PUTENV(key_value[0])); if ((4 & c_dbcsr_acc_opencl_config.wa) && NULL == getenv("ZE_FLAT_DEVICE_HIERARCHY")) { ACC_OPENCL_EXPECT(0 == LIBXSMM_PUTENV(key_value[1])); @@ -537,7 +547,7 @@ int c_dbcsr_acc_init(void) { c_dbcsr_acc_opencl_config.devices[0] = c_dbcsr_acc_opencl_config.devices[i]; } c_dbcsr_acc_opencl_config.ndevices = 1; - device_id = (int)i; + device_id = 0; break; } else if (CL_DEVICE_TYPE_ALL == type && NULL == env_devtype /*&& CL_DEVICE_TYPE_GPU == itype*/ && device_id <= (int)i) { @@ -641,6 +651,11 @@ int c_dbcsr_acc_init(void) { result = c_dbcsr_acc_opencl_set_active_device(NULL /*lock*/, ACC_OPENCL_ACTIVATE); } else { + const char* const env_rank = (NULL != getenv("PMI_RANK") ? getenv("PMI_RANK") : getenv("OMPI_COMM_WORLD_LOCAL_RANK")); + const int rank = (NULL != env_rank ? atoi(env_rank) : 0); + if (0 < rank && 1 < c_dbcsr_acc_opencl_config.ndevices) { + device_id = rank % c_dbcsr_acc_opencl_config.ndevices; + } result = c_dbcsr_acc_opencl_set_active_device(NULL /*lock*/, device_id); } # else @@ -843,6 +858,9 @@ int c_dbcsr_acc_opencl_device_name( cl_device_id device, char name[], size_t name_maxlen, char platform[], size_t platform_maxlen, int cleanup) { int result_name = 0, result_platform = 0; assert(NULL != name || NULL != platform); + if (NULL == device && 0 < c_dbcsr_acc_opencl_config.ndevices) { + device = c_dbcsr_acc_opencl_config.devices[0]; /* NULL-device refers to device 0 */ + } if (NULL != name && 0 != name_maxlen) { result_name = clGetDeviceInfo(device, CL_DEVICE_NAME, name_maxlen, name, NULL); if (0 != cleanup && EXIT_SUCCESS == result_name) { @@ -996,158 +1014,162 @@ int c_dbcsr_acc_opencl_create_context(cl_device_id active_id, cl_context* contex int c_dbcsr_acc_opencl_set_active_device(ACC_OPENCL_LOCKTYPE* lock, int device_id) { - /* accessing devices is thread-safe (array is fixed after initialization) */ - const cl_device_id active_id = - ((0 <= device_id && device_id < c_dbcsr_acc_opencl_config.ndevices) ? c_dbcsr_acc_opencl_config.devices[device_id] : NULL); int result = EXIT_SUCCESS; assert(c_dbcsr_acc_opencl_config.ndevices < ACC_OPENCL_MAXNDEVS); - if (NULL != active_id) { - cl_device_id context_id = NULL; - cl_context context = NULL; - if (NULL != lock) ACC_OPENCL_ACQUIRE(lock); - context = c_dbcsr_acc_opencl_config.device.context; - context_id = c_dbcsr_acc_opencl_config.device.id; - if (NULL != context) { - assert(NULL != context_id); - if (active_id != context_id) { + if (0 <= device_id && device_id < c_dbcsr_acc_opencl_config.ndevices) { + /* accessing devices is thread-safe (array is fixed after initialization) */ + const cl_device_id active_id = c_dbcsr_acc_opencl_config.devices[device_id]; + if (NULL != active_id) { + cl_device_id context_id = NULL; + cl_context context = NULL; + if (NULL != lock) ACC_OPENCL_ACQUIRE(lock); + context = c_dbcsr_acc_opencl_config.device.context; + context_id = c_dbcsr_acc_opencl_config.device.id; + if (NULL != context) { + assert(NULL != context_id); + if (active_id != context_id) { # if defined(CL_VERSION_1_2) - ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseDevice(context_id)); + ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseDevice(context_id)); # endif - result = clReleaseContext(context); - context_id = NULL; - context = NULL; + result = clReleaseContext(context); + context_id = NULL; + context = NULL; + } } - } - assert(NULL == context_id || active_id == context_id); - if (EXIT_SUCCESS == result && active_id != context_id) { - result = c_dbcsr_acc_opencl_create_context(active_id, &context); - assert(NULL != context || EXIT_SUCCESS != result); - } - if (EXIT_SUCCESS == result && active_id != context_id) { /* update/cache device-specific information */ - if (NULL != c_dbcsr_acc_opencl_config.device.stream.queue) { /* release private stream */ - ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseCommandQueue(c_dbcsr_acc_opencl_config.device.stream.queue)); + assert(NULL == context_id || active_id == context_id); + if (EXIT_SUCCESS == result && active_id != context_id) { + result = c_dbcsr_acc_opencl_create_context(active_id, &context); + assert(NULL != context || EXIT_SUCCESS != result); } - memset(&c_dbcsr_acc_opencl_config.device, 0, sizeof(c_dbcsr_acc_opencl_config.device)); - result = c_dbcsr_acc_opencl_device_level(active_id, c_dbcsr_acc_opencl_config.device.std_clevel, - c_dbcsr_acc_opencl_config.device.std_level, c_dbcsr_acc_opencl_config.device.std_flag, - &c_dbcsr_acc_opencl_config.device.type); - if (EXIT_SUCCESS == result) { - char devname[ACC_OPENCL_BUFFERSIZE] = ""; - const char* const sgexts[] = {"cl_intel_required_subgroup_size", "cl_intel_subgroups", "cl_khr_subgroups"}; - size_t sgsizes[16], nbytes = 0, sgmin = (size_t)-1, i; + if (EXIT_SUCCESS == result && active_id != context_id) { /* update/cache device-specific information */ + if (NULL != c_dbcsr_acc_opencl_config.device.stream.queue) { /* release private stream */ + ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseCommandQueue(c_dbcsr_acc_opencl_config.device.stream.queue)); + } + memset(&c_dbcsr_acc_opencl_config.device, 0, sizeof(c_dbcsr_acc_opencl_config.device)); + result = c_dbcsr_acc_opencl_device_level(active_id, c_dbcsr_acc_opencl_config.device.std_clevel, + c_dbcsr_acc_opencl_config.device.std_level, c_dbcsr_acc_opencl_config.device.std_flag, + &c_dbcsr_acc_opencl_config.device.type); + if (EXIT_SUCCESS == result) { + char devname[ACC_OPENCL_BUFFERSIZE] = ""; + const char* const sgexts[] = {"cl_intel_required_subgroup_size", "cl_intel_subgroups", "cl_khr_subgroups"}; + size_t sgsizes[16], nbytes = 0, sgmin = (size_t)-1, i; # if defined(ACC_OPENCL_CMDAGR) - ACC_OPENCL_STREAM_PROPERTIES_TYPE properties[4] = { - CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0 /* terminator */ - }; + ACC_OPENCL_STREAM_PROPERTIES_TYPE properties[4] = { + CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0 /* terminator */ + }; # endif # if defined(ACC_OPENCL_MEM_DEVPTR) - cl_platform_id platform = NULL; - cl_bitfield bitfield = 0; + cl_platform_id platform = NULL; + cl_bitfield bitfield = 0; # endif - c_dbcsr_acc_opencl_config.device.intel = (EXIT_SUCCESS == - c_dbcsr_acc_opencl_device_vendor(active_id, "intel", 0 /*use_platform_name*/)); - c_dbcsr_acc_opencl_config.device.nv = (EXIT_SUCCESS == - c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia", 0 /*use_platform_name*/)); + c_dbcsr_acc_opencl_config.device.intel = (EXIT_SUCCESS == + c_dbcsr_acc_opencl_device_vendor(active_id, "intel", 0 /*use_platform_name*/)); + c_dbcsr_acc_opencl_config.device.nv = (EXIT_SUCCESS == + c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia", 0 /*use_platform_name*/)); - if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_name( - active_id, devname, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/, 0 /*platform_maxlen*/, /*cleanup*/ 1) || - EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(active_id, devname, &c_dbcsr_acc_opencl_config.device.uid)) - { - c_dbcsr_acc_opencl_config.device.uid = (cl_uint)-1; - } - if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 0 /*use_platform_name*/) || - EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 1 /*use_platform_name*/)) - { - c_dbcsr_acc_opencl_config.device.amd = 1; - if ('\0' != *devname) { - const char* const gfxname = LIBXSMM_STRISTR(devname, "gfx"); - if (NULL != gfxname && 90 <= atoi(gfxname + 3)) { - c_dbcsr_acc_opencl_config.device.amd = 2; + if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_name(active_id, devname, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/, + 0 /*platform_maxlen*/, /*cleanup*/ 1) || + EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(active_id, devname, &c_dbcsr_acc_opencl_config.device.uid)) + { + c_dbcsr_acc_opencl_config.device.uid = (cl_uint)-1; + } + if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 0 /*use_platform_name*/) || + EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 1 /*use_platform_name*/)) + { + c_dbcsr_acc_opencl_config.device.amd = 1; + if ('\0' != *devname) { + const char* const gfxname = LIBXSMM_STRISTR(devname, "gfx"); + if (NULL != gfxname && 90 <= atoi(gfxname + 3)) { + c_dbcsr_acc_opencl_config.device.amd = 2; + } } } - } - if (EXIT_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool) /*cl_int*/, - &c_dbcsr_acc_opencl_config.device.unified, NULL)) - { - c_dbcsr_acc_opencl_config.device.unified = CL_FALSE; - } - if (EXIT_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), - c_dbcsr_acc_opencl_config.device.wgsize, NULL)) - { - c_dbcsr_acc_opencl_config.device.wgsize[0] = 1; - } - if (EXIT_SUCCESS != clGetDeviceInfo(active_id, 4199 /*CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE*/, sizeof(size_t), - c_dbcsr_acc_opencl_config.device.wgsize + 1, NULL)) /* CL_VERSION_3_0 */ - { - c_dbcsr_acc_opencl_config.device.wgsize[1] = 1; - } - assert(0 == c_dbcsr_acc_opencl_config.device.wgsize[2]); - if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_id, sgexts, 2) && - EXIT_SUCCESS == - clGetDeviceInfo(active_id, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/, sizeof(sgsizes), sgsizes, &nbytes)) - { - for (i = 0; (i * sizeof(size_t)) < nbytes; ++i) { - const size_t sgsize = sgsizes[i]; - if (sgsize < sgmin) sgmin = sgsize; - if (0 == (sgsize % c_dbcsr_acc_opencl_config.device.wgsize[1]) && c_dbcsr_acc_opencl_config.device.wgsize[2] < sgsize) { - if (c_dbcsr_acc_opencl_config.device.wgsize[1] < sgsize) c_dbcsr_acc_opencl_config.device.wgsize[1] = sgsize; - c_dbcsr_acc_opencl_config.device.wgsize[2] = sgsize; + if (EXIT_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool) /*cl_int*/, + &c_dbcsr_acc_opencl_config.device.unified, NULL)) + { + c_dbcsr_acc_opencl_config.device.unified = CL_FALSE; + } + if (EXIT_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), + c_dbcsr_acc_opencl_config.device.wgsize, NULL)) + { + c_dbcsr_acc_opencl_config.device.wgsize[0] = 1; + } + if (EXIT_SUCCESS != clGetDeviceInfo(active_id, 4199 /*CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE*/, sizeof(size_t), + c_dbcsr_acc_opencl_config.device.wgsize + 1, NULL)) /* CL_VERSION_3_0 */ + { + c_dbcsr_acc_opencl_config.device.wgsize[1] = 1; + } + assert(0 == c_dbcsr_acc_opencl_config.device.wgsize[2]); + if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_id, sgexts, 2) && + EXIT_SUCCESS == + clGetDeviceInfo(active_id, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/, sizeof(sgsizes), sgsizes, &nbytes)) + { + for (i = 0; (i * sizeof(size_t)) < nbytes; ++i) { + const size_t sgsize = sgsizes[i]; + if (sgsize < sgmin) sgmin = sgsize; + if (0 != c_dbcsr_acc_opencl_config.device.wgsize[1] && 0 == (sgsize % c_dbcsr_acc_opencl_config.device.wgsize[1]) && + c_dbcsr_acc_opencl_config.device.wgsize[2] < sgsize) + { + if (c_dbcsr_acc_opencl_config.device.wgsize[1] < sgsize) c_dbcsr_acc_opencl_config.device.wgsize[1] = sgsize; + c_dbcsr_acc_opencl_config.device.wgsize[2] = sgsize; + } } + if (0 != c_dbcsr_acc_opencl_config.device.wgsize[2]) c_dbcsr_acc_opencl_config.device.wgsize[2] = sgmin; + } + else { + c_dbcsr_acc_opencl_config.device.wgsize[2] = 0; + } +# if defined(ACC_OPENCL_XHINTS) && defined(ACC_OPENCL_MEM_DEVPTR) + if (0 != (1 & c_dbcsr_acc_opencl_config.xhints) && 2 <= *c_dbcsr_acc_opencl_config.device.std_level && + 0 != c_dbcsr_acc_opencl_config.device.intel && 0 == c_dbcsr_acc_opencl_config.device.unified && + EXIT_SUCCESS == clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL) && + EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "intel", 2 /*platform vendor*/) && + EXIT_SUCCESS == clGetDeviceInfo(active_id, 0x4191 /*CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL*/, sizeof(cl_bitfield), + &bitfield, NULL) && + 0 != bitfield) /* cl_intel_unified_shared_memory extension */ + { + void* ptr = NULL; + ptr = clGetExtensionFunctionAddressForPlatform(platform, "clSetKernelArgMemPointerINTEL"); + LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clSetKernelArgMemPointerINTEL, &ptr); + ptr = clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemFillINTEL"); + LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clEnqueueMemFillINTEL, &ptr); + ptr = clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemcpyINTEL"); + LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL, &ptr); + ptr = clGetExtensionFunctionAddressForPlatform(platform, "clDeviceMemAllocINTEL"); + LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clDeviceMemAllocINTEL, &ptr); + ptr = clGetExtensionFunctionAddressForPlatform(platform, "clMemFreeINTEL"); + LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clMemFreeINTEL, &ptr); } - if (0 != c_dbcsr_acc_opencl_config.device.wgsize[2]) c_dbcsr_acc_opencl_config.device.wgsize[2] = sgmin; - } - else { - c_dbcsr_acc_opencl_config.device.wgsize[2] = 0; - } -# if defined(ACC_OPENCL_MEM_DEVPTR) - if (0 != (1 & c_dbcsr_acc_opencl_config.xhints) && 2 <= *c_dbcsr_acc_opencl_config.device.std_level && - 0 != c_dbcsr_acc_opencl_config.device.intel && 0 == c_dbcsr_acc_opencl_config.device.unified && - EXIT_SUCCESS == clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL) && - EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "intel", 2 /*platform vendor*/) && - EXIT_SUCCESS == clGetDeviceInfo(active_id, 0x4191 /*CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL*/, sizeof(cl_bitfield), - &bitfield, NULL) && - 0 != bitfield) /* cl_intel_unified_shared_memory extension */ - { - void* ptr = NULL; - ptr = clGetExtensionFunctionAddressForPlatform(platform, "clSetKernelArgMemPointerINTEL"); - LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clSetKernelArgMemPointerINTEL, &ptr); - ptr = clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemFillINTEL"); - LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clEnqueueMemFillINTEL, &ptr); - ptr = clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemcpyINTEL"); - LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL, &ptr); - ptr = clGetExtensionFunctionAddressForPlatform(platform, "clDeviceMemAllocINTEL"); - LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clDeviceMemAllocINTEL, &ptr); - ptr = clGetExtensionFunctionAddressForPlatform(platform, "clMemFreeINTEL"); - LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clMemFreeINTEL, &ptr); - } # endif # if defined(ACC_OPENCL_CMDAGR) - if (0 != c_dbcsr_acc_opencl_config.device.intel) { /* device vendor (above) can now be used */ - int result_cmdagr = EXIT_SUCCESS; - const cl_command_queue q = ACC_OPENCL_CREATE_COMMAND_QUEUE(context, active_id, properties, &result_cmdagr); - if (EXIT_SUCCESS == result_cmdagr) { + if (0 != c_dbcsr_acc_opencl_config.device.intel) { /* device vendor (above) can now be used */ + int result_cmdagr = EXIT_SUCCESS; + const cl_command_queue q = ACC_OPENCL_CREATE_COMMAND_QUEUE(context, active_id, properties, &result_cmdagr); + if (EXIT_SUCCESS == result_cmdagr) { # if 0 /* force host-timer? */ - c_dbcsr_acc_opencl_config.timer = c_dbcsr_acc_opencl_timer_host; + c_dbcsr_acc_opencl_config.timer = c_dbcsr_acc_opencl_timer_host; # endif - assert(NULL != q); - clReleaseCommandQueue(q); + assert(NULL != q); + clReleaseCommandQueue(q); + } } - } # endif - properties[1] = 0; - c_dbcsr_acc_opencl_config.device.stream.queue = ACC_OPENCL_CREATE_COMMAND_QUEUE(context, active_id, properties, &result); - } - if (EXIT_SUCCESS == result) { - if (active_id != context_id) { - assert(active_id != c_dbcsr_acc_opencl_config.device.id); - c_dbcsr_acc_opencl_config.device.context = context; - c_dbcsr_acc_opencl_config.device.id = active_id; + properties[1] = 0; + c_dbcsr_acc_opencl_config.device.stream.queue = ACC_OPENCL_CREATE_COMMAND_QUEUE(context, active_id, properties, &result); + } + if (EXIT_SUCCESS == result) { + if (active_id != context_id) { + assert(active_id != c_dbcsr_acc_opencl_config.device.id); + c_dbcsr_acc_opencl_config.device.context = context; + c_dbcsr_acc_opencl_config.device.id = active_id; + } + assert(active_id == c_dbcsr_acc_opencl_config.device.id); } - assert(active_id == c_dbcsr_acc_opencl_config.device.id); + else memset(&c_dbcsr_acc_opencl_config.device, 0, sizeof(c_dbcsr_acc_opencl_config.device)); } - else memset(&c_dbcsr_acc_opencl_config.device, 0, sizeof(c_dbcsr_acc_opencl_config.device)); + if (NULL != lock) ACC_OPENCL_RELEASE(lock); } - if (NULL != lock) ACC_OPENCL_RELEASE(lock); + else result = EXIT_FAILURE; } else result = EXIT_FAILURE; assert(EXIT_SUCCESS == result || NULL == c_dbcsr_acc_opencl_config.device.context); @@ -1156,13 +1178,8 @@ int c_dbcsr_acc_opencl_set_active_device(ACC_OPENCL_LOCKTYPE* lock, int device_i int c_dbcsr_acc_set_active_device(int device_id) { + /* avoid ACC_OPENCL_PROFILE in this routine */ int result = EXIT_SUCCESS; -# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) - int routine_handle; - static const char* const routine_name_ptr = LIBXSMM_FUNCNAME; - static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1; - c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle); -# endif if (0 <= device_id && device_id < c_dbcsr_acc_opencl_config.ndevices) { # if defined(ACC_OPENCL_CACHE_DID) if (c_dbcsr_acc_opencl_active_id != (device_id + 1)) @@ -1174,12 +1191,7 @@ int c_dbcsr_acc_set_active_device(int device_id) { # endif } } -# if !defined(NDEBUG) else result = EXIT_FAILURE; -# endif -# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) - c_dbcsr_timestop(&routine_handle); -# endif ACC_OPENCL_RETURN(result); } diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h index cd6639983fd..ddd1834051f 100644 --- a/src/acc/opencl/acc_opencl.h +++ b/src/acc/opencl/acc_opencl.h @@ -104,6 +104,9 @@ #if !defined(ACC_OPENCL_ASYNC) && 1 # define ACC_OPENCL_ASYNC getenv("ACC_OPENCL_ASYNC") #endif +#if !defined(ACC_OPENCL_XHINTS) && 1 +# define ACC_OPENCL_XHINTS getenv("ACC_OPENCL_XHINTS") +#endif #if !defined(ACC_OPENCL_STREAM_PRIORITIES) && 0 # if defined(CL_QUEUE_PRIORITY_KHR) # define ACC_OPENCL_STREAM_PRIORITIES @@ -121,7 +124,7 @@ # define ACC_OPENCL_ACTIVATE 0 #endif /* Use DBCSR's profile for detailed timings */ -#if !defined(ACC_OPENCL_PROFILE) && 0 +#if !defined(ACC_OPENCL_PROFILE) && (defined(__OFFLOAD_PROFILING) || 0) # define ACC_OPENCL_PROFILE #endif @@ -359,7 +362,7 @@ typedef struct c_dbcsr_acc_opencl_config_t { extern c_dbcsr_acc_opencl_config_t c_dbcsr_acc_opencl_config; /** Determines host-pointer registration for modification. */ -c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory); +c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_hostptr(const void* memory); /** Determines device-pointer registration for modification (internal). */ c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_devptr_modify( ACC_OPENCL_LOCKTYPE* lock, void* memory, size_t elsize, const size_t* amount, size_t* offset); diff --git a/src/acc/opencl/acc_opencl_mem.c b/src/acc/opencl/acc_opencl_mem.c index 41fc76519c0..7d9bd86a4d0 100644 --- a/src/acc/opencl/acc_opencl_mem.c +++ b/src/acc/opencl/acc_opencl_mem.c @@ -60,7 +60,7 @@ void c_dbcsr_acc_opencl_pfree(ACC_OPENCL_LOCKTYPE* lock, const void* pointer, vo } -c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory) { +c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_hostptr(const void* memory) { assert(NULL == memory || sizeof(c_dbcsr_acc_opencl_info_memptr_t) <= (uintptr_t)memory); return (NULL != memory ? (c_dbcsr_acc_opencl_info_memptr_t*)((uintptr_t)memory - sizeof(c_dbcsr_acc_opencl_info_memptr_t)) : (c_dbcsr_acc_opencl_info_memptr_t*)NULL); @@ -168,6 +168,8 @@ int c_dbcsr_acc_opencl_info_devptr( int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream) { const size_t size_meminfo = sizeof(c_dbcsr_acc_opencl_info_memptr_t); int result = EXIT_SUCCESS, alignment = sizeof(void*); + cl_mem_flags flags = CL_MEM_ALLOC_HOST_PTR; + void* host_ptr = NULL; cl_mem memory = NULL; # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) int routine_handle; @@ -186,12 +188,25 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream) EXIT_SUCCESS == c_dbcsr_acc_opencl_set_active_device(NULL /*lock*/, (int)c_dbcsr_acc_opencl_config.device.uid)); } # endif - memory = clCreateBuffer(c_dbcsr_acc_opencl_config.device.context, CL_MEM_ALLOC_HOST_PTR, nbytes, NULL /*host_ptr*/, &result); +# if defined(ACC_OPENCL_XHINTS) + if (0 != (8 & c_dbcsr_acc_opencl_config.xhints) && (0 != c_dbcsr_acc_opencl_config.device.nv || NULL != (ACC_OPENCL_XHINTS))) { + host_ptr = malloc(nbytes); + if (NULL != host_ptr) flags = CL_MEM_USE_HOST_PTR; + } +# endif + memory = clCreateBuffer(c_dbcsr_acc_opencl_config.device.context, flags, nbytes, host_ptr, &result); if (EXIT_SUCCESS == result) { - const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? ACC_OPENCL_STREAM(stream) - : c_dbcsr_acc_opencl_stream_default()); - void* const mapped = clEnqueueMapBuffer( - str->queue, memory, CL_TRUE /*always block*/, CL_MAP_READ | CL_MAP_WRITE, 0 /*offset*/, nbytes, 0, NULL, NULL, &result); + void* mapped = host_ptr; + if (NULL == host_ptr) { + const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? ACC_OPENCL_STREAM(stream) + : c_dbcsr_acc_opencl_stream_default()); + mapped = clEnqueueMapBuffer(str->queue, memory, CL_TRUE /*always block*/, +# if defined(ACC_OPENCL_XHINTS) && (defined(CL_VERSION_1_2) || defined(CL_MAP_WRITE_INVALIDATE_REGION)) + (4 & c_dbcsr_acc_opencl_config.xhints) ? CL_MAP_WRITE_INVALIDATE_REGION : +# endif + (CL_MAP_READ | CL_MAP_WRITE), + 0 /*offset*/, nbytes, 0, NULL, NULL, &result); + } assert(EXIT_SUCCESS == result || NULL == mapped); if (EXIT_SUCCESS == result) { const uintptr_t address = (uintptr_t)mapped; @@ -210,6 +225,7 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream) if (EXIT_SUCCESS != result) { if (NULL != memory) ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseMemObject(memory)); *host_mem = NULL; + free(host_ptr); } assert(EXIT_SUCCESS == result || NULL == *host_mem); # if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) @@ -231,13 +247,25 @@ int c_dbcsr_acc_host_mem_deallocate(void* host_mem, void* stream) { c_dbcsr_acc_opencl_info_memptr_t* const meminfo = c_dbcsr_acc_opencl_info_hostptr(host_mem); if (NULL != meminfo->memory) { const c_dbcsr_acc_opencl_info_memptr_t info = *meminfo; /* copy meminfo prior to unmap */ - const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? ACC_OPENCL_STREAM(stream) - : c_dbcsr_acc_opencl_stream_default()); + void* host_ptr = NULL; int result_release; - cl_event event; - assert(NULL != str && NULL != str->queue); - result = clEnqueueUnmapMemObject(str->queue, info.memory, info.memptr, 0, NULL, &event); - if (NULL == stream && EXIT_SUCCESS == result) result = clWaitForEvents(1, &event); +# if defined(ACC_OPENCL_XHINTS) + if (0 != (8 & c_dbcsr_acc_opencl_config.xhints) && + (0 != c_dbcsr_acc_opencl_config.device.nv || NULL != (ACC_OPENCL_XHINTS)) && + EXIT_SUCCESS == clGetMemObjectInfo(info.memory, CL_MEM_HOST_PTR, sizeof(void*), &host_ptr, NULL) && NULL != host_ptr) + { + free(host_ptr); + } + if (NULL == host_ptr) +# endif + { + const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? ACC_OPENCL_STREAM(stream) + : c_dbcsr_acc_opencl_stream_default()); + cl_event event; + assert(NULL != str && NULL != str->queue); + result = clEnqueueUnmapMemObject(str->queue, info.memory, info.memptr, 0, NULL, &event); + if (NULL == stream && EXIT_SUCCESS == result) result = clWaitForEvents(1, &event); + } result_release = clReleaseMemObject(info.memory); if (EXIT_SUCCESS == result) result = result_release; } @@ -256,14 +284,14 @@ int c_dbcsr_acc_opencl_memcpy_d2h( int c_dbcsr_acc_opencl_memcpy_d2h( cl_mem dev_mem, void* host_mem, size_t offset, size_t nbytes, cl_command_queue queue, int blocking) { # if defined(ACC_OPENCL_ASYNC) - const cl_bool finish = (0 != blocking || 0 == (2 & c_dbcsr_acc_opencl_config.async) || - (0 != c_dbcsr_acc_opencl_config.device.nv && NULL == (ACC_OPENCL_ASYNC))); + const cl_bool finish = (0 != blocking || 0 == (2 & c_dbcsr_acc_opencl_config.async)); # else const cl_bool finish = CL_TRUE; # endif int result = EXIT_SUCCESS; # if defined(ACC_OPENCL_MEM_DEVPTR) if (NULL != c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL) { + assert(0 == c_dbcsr_acc_opencl_config.device.unified); result = c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL(queue, finish, host_mem, dev_mem, nbytes, 0, NULL, NULL); } else @@ -275,6 +303,7 @@ int c_dbcsr_acc_opencl_memcpy_d2h( int result_sync = EXIT_SUCCESS; # if defined(ACC_OPENCL_MEM_DEVPTR) if (NULL != c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL) { + assert(0 == c_dbcsr_acc_opencl_config.device.unified); result_sync = c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL(queue, CL_TRUE, host_mem, dev_mem, nbytes, 0, NULL, NULL); } else @@ -316,6 +345,7 @@ int c_dbcsr_acc_dev_mem_allocate(void** dev_mem, size_t nbytes) { assert(NULL != dev_mem && NULL != context); # if defined(ACC_OPENCL_MEM_DEVPTR) if (NULL != c_dbcsr_acc_opencl_config.device.clDeviceMemAllocINTEL) { + assert(0 == c_dbcsr_acc_opencl_config.device.unified); *dev_mem = memptr = c_dbcsr_acc_opencl_config.device.clDeviceMemAllocINTEL( context, c_dbcsr_acc_opencl_config.device.id, NULL /*properties*/, nbytes, 0 /*alignment*/, &result); if (EXIT_SUCCESS != result) *dev_mem = NULL; @@ -409,6 +439,7 @@ int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem) { # else assert(NULL != c_dbcsr_acc_opencl_config.device.context); if (NULL != c_dbcsr_acc_opencl_config.device.clMemFreeINTEL) { + assert(0 == c_dbcsr_acc_opencl_config.device.unified); result = c_dbcsr_acc_opencl_config.device.clMemFreeINTEL(c_dbcsr_acc_opencl_config.device.context, dev_mem); } else { @@ -479,14 +510,14 @@ int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, v const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream(NULL /*lock*/, ACC_OPENCL_OMP_TID())); # if defined(ACC_OPENCL_ASYNC) - const cl_bool finish = (0 == (1 & c_dbcsr_acc_opencl_config.async) || NULL == stream || - (0 != c_dbcsr_acc_opencl_config.device.nv && NULL == (ACC_OPENCL_ASYNC))); + const cl_bool finish = (0 == (1 & c_dbcsr_acc_opencl_config.async) || NULL == stream); # else const cl_bool finish = CL_TRUE; # endif assert(NULL != str && NULL != str->queue); # if defined(ACC_OPENCL_MEM_DEVPTR) if (NULL != c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL) { + assert(0 == c_dbcsr_acc_opencl_config.device.unified); result = c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL(str->queue, finish, dev_mem, host_mem, nbytes, 0, NULL, NULL); } else @@ -566,6 +597,7 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt # if defined(ACC_OPENCL_MEM_DEVPTR) assert(NULL != c_dbcsr_acc_opencl_config.device.context); if (NULL != c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL) { + assert(0 == c_dbcsr_acc_opencl_config.device.unified); result = c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL( str->queue, CL_FALSE /*blocking*/, devmem_dst, devmem_src, nbytes, 0, NULL, &event); } @@ -616,6 +648,7 @@ int c_dbcsr_acc_opencl_memset(void* dev_mem, int value, size_t offset, size_t nb # if defined(ACC_OPENCL_MEM_DEVPTR) assert(NULL != c_dbcsr_acc_opencl_config.device.context); if (NULL != c_dbcsr_acc_opencl_config.device.clEnqueueMemFillINTEL) { + assert(0 == c_dbcsr_acc_opencl_config.device.unified); result = c_dbcsr_acc_opencl_config.device.clEnqueueMemFillINTEL( str->queue, (char*)dev_mem + offset, &value, size_of_value, nbytes, 0, NULL, &event); } diff --git a/src/acc/opencl/acc_opencl_stream.c b/src/acc/opencl/acc_opencl_stream.c index 41297015ba8..29ade32dba8 100644 --- a/src/acc/opencl/acc_opencl_stream.c +++ b/src/acc/opencl/acc_opencl_stream.c @@ -117,6 +117,7 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) { if (NULL != c_dbcsr_acc_opencl_config.device.context) # endif { +# if defined(ACC_OPENCL_XHINTS) if ((2 & c_dbcsr_acc_opencl_config.xhints) && 0 != c_dbcsr_acc_opencl_config.device.intel) { /* enable queue families */ struct { cl_command_queue_properties properties; @@ -141,6 +142,7 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) { } } } +# endif if ((c_dbcsr_acc_opencl_timer_device == c_dbcsr_acc_opencl_config.timer) && (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity)) { diff --git a/src/acc/opencl/smm/opencl_libsmm.c b/src/acc/opencl/smm/opencl_libsmm.c index 97a0e84a891..409659980cb 100644 --- a/src/acc/opencl/smm/opencl_libsmm.c +++ b/src/acc/opencl/smm/opencl_libsmm.c @@ -28,15 +28,20 @@ libxsmm_gemm_descriptor_dinit(BLOB, PREC, M, N, K, LDA, LDB, LDC, 1.0, 1.0, FLAGS, PREFETCH) # endif -# if !defined(OPENCL_LIBSMM_VALIDATE_TRANS) && defined(OPENCL_LIBSMM_VALIDATE) && \ - (1 < OPENCL_LIBSMM_VALIDATE || 0 > OPENCL_LIBSMM_VALIDATE) -# define OPENCL_LIBSMM_VALIDATE_TRANS -# endif -# if !defined(OPENCL_LIBSMM_VALIDATE_SMM) && defined(OPENCL_LIBSMM_VALIDATE) -# define OPENCL_LIBSMM_VALIDATE_SMM -# endif -# if !defined(OPENCL_LIBSMM_VALIDATE_EXIT) && defined(OPENCL_LIBSMM_VALIDATE) && 1 -# define OPENCL_LIBSMM_VALIDATE_EXIT +# if defined(OPENCL_LIBSMM_VALIDATE) +# if !defined(OPENCL_LIBSMM_VALIDATE_TRANS) && (1 < OPENCL_LIBSMM_VALIDATE || 0 > OPENCL_LIBSMM_VALIDATE) +# define OPENCL_LIBSMM_VALIDATE_TRANS +# endif +# if !defined(OPENCL_LIBSMM_VALIDATE_SMM) +# define OPENCL_LIBSMM_VALIDATE_SMM +# endif +# if !defined(OPENCL_LIBSMM_VALIDATE_EXIT) && 1 +# define OPENCL_LIBSMM_VALIDATE_EXIT +# endif +# if !defined(OPENCL_LIBSMM_VALIDATE_SCRATCH) +# define OPENCL_LIBSMM_VALIDATE_SCRATCH(SIZE, ALIGN) /*libxsmm_aligned_scratch(SIZE, ALIGN)*/ malloc(SIZE) +# define OPENCL_LIBSMM_VALIDATE_FREE(PTR) /*libxsmm_free(PTR)*/ free(PTR) +# endif # endif # if !defined(OPENCL_LIBSMM_KERNELNAME_TRANS) # define OPENCL_LIBSMM_KERNELNAME_TRANS "trans" @@ -111,31 +116,6 @@ int opencl_libsmm_use_cmem(cl_device_id device) { } -# if defined(OPENCL_LIBSMM_VALIDATE) && (0 != OPENCL_LIBSMM_VALIDATE) -void opencl_libsmm_print_matrix(FILE* ostream, const char* label, libsmm_acc_data_t type, const void* mat, int m, int n) { - int i, j; - const char* const s = (NULL != label ? label : ""); - const int len = (int)strlen(s); - for (i = 0; i < m; ++i) { - if (0 < i) { - fprintf(ostream, "%*s", len, " "); - } - else { - fprintf(ostream, "%s", s); - } - for (j = 0; j < n; ++j) { - switch (type) { - case dbcsr_type_real_8: fprintf(ostream, "%.2f ", ((const double*)mat)[i * n + j]); break; - case dbcsr_type_real_4: fprintf(ostream, "%.2f ", ((const float*)mat)[i * n + j]); break; - default: fprintf(ostream, "? "); - } - } - fprintf(ostream, "\n"); - } -} -# endif - - int opencl_libsmm_write_trans_params(FILE* stream, int only_key, const opencl_libsmm_transkey_t* key, const opencl_libsmm_trans_t* config, const char* delim, const char* begin, const char* close) { int result = 0; @@ -209,7 +189,7 @@ int opencl_libsmm_read_smm_params(char* parambuf, opencl_libsmm_smmkey_t* key, o LIBXSMM_MEMZERO127(key); /* potentially heterogeneous key-data (alignment gaps) */ memset(value, 0, sizeof(opencl_libsmm_smm_t)); for (; NULL != s; - ++i, s = (c != consumed ? ((s + 1) < end ? strtok((s + 1) + strlen(s), ACC_OPENCL_DELIMS) : NULL) : s), c = consumed) + ++i, s = (c != consumed ? ((s + 1) < end ? strtok((s + 1) + strlen(s), ACC_OPENCL_DELIMS) : NULL) : s), c = consumed) { switch (i) { case 0: @@ -521,7 +501,9 @@ int libsmm_acc_init(void) { memcpy(config_init, &config, sizeof(config)); } # if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER - if (active_match == i && c_dbcsr_acc_opencl_config.device.uid != key.devuid) { + if (active_match == i && 0 != c_dbcsr_acc_opencl_config.device.uid && + c_dbcsr_acc_opencl_config.device.uid != key.devuid) + { key.devuid = c_dbcsr_acc_opencl_config.device.uid; config_init = (opencl_libsmm_smm_t*)libxsmm_xdispatch(&key, sizeof(key)); if (NULL == config_init && NULL != libxsmm_xregister(&key, sizeof(key), sizeof(config), &config)) { @@ -786,7 +768,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v const size_t scratch_size = (sizeof(int) * offset_stack_size) /*stack*/ + data_size /*imat*/ + data_size /*omat*/ + (mn * typesize) /*gold*/ + 3 * (LIBXSMM_ALIGNMENT - 1) /*alignments*/; - scratch = libxsmm_aligned_scratch(scratch_size, LIBXSMM_ALIGNMENT); + scratch = OPENCL_LIBSMM_VALIDATE_SCRATCH(scratch_size, LIBXSMM_ALIGNMENT); if (NULL != scratch) { stack = (int*)scratch; imat = (char*)LIBXSMM_UP2((uintptr_t)stack + sizeof(int) * offset_stack_size, LIBXSMM_ALIGNMENT); @@ -855,20 +837,15 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v } # if defined(OPENCL_LIBSMM_VALIDATE_TRANS) ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_data, omat, data_size, stream), "transfer validation test", result); -# endif -# if defined(OPENCL_LIBSMM_VALIDATE_TRANS) ACC_OPENCL_CHECK(c_dbcsr_acc_stream_sync(stream), "sync stream", result); -# endif -# if defined(OPENCL_LIBSMM_VALIDATE_TRANS) if (EXIT_SUCCESS == result) { - int i, j; - LIBXSMM_STDIO_ACQUIRE(); + char print_buffer[2048] = ""; + int print_offset = 0, i, j; if (0 != c_dbcsr_acc_opencl_config.verbosity) { - fprintf(stderr, - "libsmm_acc_transpose(" - "offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)", - offset, stack_size, dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m, n, - max_kernel_dim, stream); + print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, + "libsmm_acc_transpose(offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)", offset, stack_size, + dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m, n, max_kernel_dim, + stream); } for (i = offset; i < offset_stack_size; ++i) { const size_t index = stack[i]; @@ -879,20 +856,12 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v libxsmm_itrans(gold, typesize, m, n, m, n); if (0 != memcmp(gold, test, mn * typesize)) { if (0 == c_dbcsr_acc_opencl_config.verbosity) { - fprintf(stderr, - "libsmm_acc_transpose(" - "offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)", - offset, stack_size, dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m, - n, max_kernel_dim, stream); - } - fprintf(stderr, " => ERROR\n"); - if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) { - fprintf(stderr, "stackposition = %i (index=%llu)\n", i, (unsigned long long)index); - opencl_libsmm_print_matrix(stderr, "orig = ", datatype, orig, m, n); - opencl_libsmm_print_matrix(stderr, "gold = ", datatype, gold, n, m); - opencl_libsmm_print_matrix(stderr, "test = ", datatype, test, n, m); - fprintf(stderr, "\n"); + print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, + "libsmm_acc_transpose(offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)", offset, stack_size, + dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m, n, max_kernel_dim, + stream); } + print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => ERROR\n"); # if defined(OPENCL_LIBSMM_VALIDATE_EXIT) exit(EXIT_FAILURE); # else @@ -903,7 +872,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v for (j = offset; j < i; ++j) { const size_t duplicate = stack[j]; if (index == duplicate) { - fprintf(stderr, " => ERROR\n"); + print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => ERROR\n"); # if defined(OPENCL_LIBSMM_VALIDATE_EXIT) exit(EXIT_FAILURE); # else @@ -915,8 +884,10 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v } } if (0 != c_dbcsr_acc_opencl_config.verbosity && EXIT_SUCCESS == result) { - fprintf(stderr, " => OK\n"); + print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => OK\n"); } + LIBXSMM_STDIO_ACQUIRE(); + fputs(print_buffer, stderr); LIBXSMM_STDIO_RELEASE(); } libxsmm_free(scratch); @@ -1113,7 +1084,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, (NULL == env_nz || '\0' == *env_nz) ? (0 != defaults ? /*default*/ 0 : config->nz) : atoi(env_nz), 0, 1); new_config.al = LIBXSMM_CLMP(/* bug: AL=1 */ (NULL == env_al || '\0' == *env_al) - ? (0 == (32 & c_dbcsr_acc_opencl_config.wa) ? (0 != defaults ? 0 : config->al) : 0) + ? (0 == (64 & c_dbcsr_acc_opencl_config.wa) ? (0 != defaults ? 0 : config->al) : 0) : atoi(env_al), 0, 1); new_config.tb = LIBXSMM_CLMP( @@ -1124,7 +1095,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, (NULL == env_ap || '\0' == *env_ap) ? (0 != defaults ? /*default*/ 0 : config->ap) : atoi(env_ap), 0, 1); new_config.aa = LIBXSMM_CLMP(/* bug: AA=2 XF=1 */ (NULL == env_aa || '\0' == *env_aa) ? (0 != defaults ? default_aa : config->aa) : atoi(env_aa), 0, - (0 == (64 & c_dbcsr_acc_opencl_config.wa) || 0 == new_config.flags) ? 2 : 1); + (0 == (32 & c_dbcsr_acc_opencl_config.wa) || 0 == new_config.flags) ? 2 : 1); new_config.ab = LIBXSMM_CLMP( (NULL == env_ab || '\0' == *env_ab) ? (0 != defaults ? default_ab : config->ab) : atoi(env_ab), 0, 2); new_config.ac = LIBXSMM_CLMP( @@ -1342,7 +1313,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, &blob, precision, m_max, n_max, k_max, m_max, k_max, m_max, LIBXSMM_GEMM_FLAG_NONE, LIBXSMM_PREFETCH_NONE); const size_t scratch_size = psize + asize + bsize + csize + csize + k_max * n_max * typesize + 5 * (LIBXSMM_ALIGNMENT - 1) /*alignments*/; - scratch = libxsmm_aligned_scratch(scratch_size, LIBXSMM_ALIGNMENT); + scratch = OPENCL_LIBSMM_VALIDATE_SCRATCH(scratch_size, LIBXSMM_ALIGNMENT); if (NULL != desc && NULL != scratch) { pinp = (int*)scratch; ainp = (char*)LIBXSMM_UP2((uintptr_t)pinp + psize, LIBXSMM_ALIGNMENT); @@ -1429,10 +1400,12 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, const char* const env_tol = getenv("OPENCL_LIBSMM_SMM_TOLERANCE"); const double tolerance = ((NULL == env_tol || '\0' == *env_tol) ? 1E-3 : atof(env_tol)); const int* const params = pinp + (4 <= nparams ? (nparams - 4) : 0); + char print_buffer[2048] = ""; + int print_offset = 0; size_t i; - LIBXSMM_STDIO_ACQUIRE(); if (0 != c_dbcsr_acc_opencl_config.verbosity) { - fprintf(stderr, "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size, + print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, + "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size, dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m_max, n_max, k_max, max_kernel_dim, stream); } @@ -1458,20 +1431,21 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, # endif if (tolerance < epsilon) { if (0 == c_dbcsr_acc_opencl_config.verbosity) { - fprintf(stderr, "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size, + print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, + "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size, dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m_max, n_max, k_max, max_kernel_dim, stream); } # if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER - fprintf(stderr, " => ERROR diff=%g (%g != %g)\n", diff.linf_abs, diff.v_ref, diff.v_tst); -# else - fprintf(stderr, " => ERROR diff=%g\n", diff.linf_abs); + if (LIBXSMM_NOTNAN(diff.v_tst)) { + print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, + " => ERROR diff=%g (|%g-%g|=%g)\n", epsilon, diff.v_ref, diff.v_tst, diff.linf_abs); + } + else # endif - if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) { - fprintf(stderr, "stackposition = %llu (index=%llu)\n", (unsigned long long)i, (unsigned long long)ic); - opencl_libsmm_print_matrix(stderr, "gold = ", datatype, gold + ic, m_max, n_max); - opencl_libsmm_print_matrix(stderr, "test = ", datatype, test + ic, m_max, n_max); - fprintf(stderr, "\n"); + { + print_offset += LIBXSMM_SNPRINTF( + print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => ERROR diff=%g\n", epsilon); } # if defined(OPENCL_LIBSMM_VALIDATE_EXIT) exit(EXIT_FAILURE); @@ -1482,8 +1456,10 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, } } if (0 != c_dbcsr_acc_opencl_config.verbosity && EXIT_SUCCESS == result) { - fprintf(stderr, " => OK\n"); + print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => OK\n"); } + LIBXSMM_STDIO_ACQUIRE(); + fputs(print_buffer, stderr); LIBXSMM_STDIO_RELEASE(); } libxsmm_free(scratch); diff --git a/src/acc/opencl/smm/params/tune_multiply_GH200.csv b/src/acc/opencl/smm/params/tune_multiply_GH200.csv new file mode 100644 index 00000000000..7275e0e771f --- /dev/null +++ b/src/acc/opencl/smm/params/tune_multiply_GH200.csv @@ -0,0 +1,317 @@ +DEVICE;TYPEID;M;N;K;S;GFLOPS;BS;BM;BN;BK;WS;WG;LU;NZ;AL;TB;TC;AP;AA;AB;AC +NVIDIA GH200 480GB [0x3528];3;2;2;2;30000;0;11;2;1;2;2;1;-2;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;3;3;3;30000;0;12;3;1;2;3;1;0;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;4;4;30000;0;14;4;1;2;4;-1;1;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;4;5;30000;0;13;4;1;2;4;1;-2;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;4;7;30000;0;12;4;1;3;4;-2;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;4;9;30000;0;12;4;1;4;4;0;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;4;10;30000;0;5;4;1;4;4;0;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;4;13;30000;0;4;4;1;3;4;1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;4;15;30000;0;4;4;1;3;4;0;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;4;17;30000;0;4;4;1;4;4;0;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;4;25;30000;0;19;4;1;3;4;-2;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;4;26;30000;0;3;4;1;2;4;0;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;4;28;30000;0;3;4;1;2;4;-1;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;4;32;30000;0;3;4;1;2;4;-2;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;5;4;30000;0;15;4;1;2;5;0;0;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;5;5;30000;0;13;4;1;2;5;0;-2;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;5;7;30000;0;13;4;1;4;5;1;-1;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;5;9;30000;0;10;4;1;2;5;-2;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;5;13;30000;0;5;4;1;2;5;1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;5;17;30000;0;4;4;1;4;5;-1;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;5;25;30000;0;12;4;1;2;5;-1;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;5;32;30000;0;3;4;1;4;5;-1;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;7;4;30000;0;15;4;1;3;7;-2;-1;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;7;5;30000;0;12;4;1;2;7;0;-2;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;7;7;30000;0;13;4;1;2;7;0;0;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;7;9;30000;0;5;4;1;4;7;1;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;7;13;30000;0;4;4;1;4;7;-2;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;9;4;30000;0;13;4;1;3;9;1;-2;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;9;5;30000;0;13;4;1;2;9;-2;-2;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;9;7;30000;0;13;4;1;4;9;0;0;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;9;9;30000;0;10;4;1;2;9;-2;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;9;13;30000;0;5;4;1;4;9;-2;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;10;4;30000;0;13;4;1;4;10;1;0;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;10;10;30000;0;10;4;1;4;10;1;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;13;4;30000;0;16;4;1;2;13;0;-1;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;13;5;30000;0;13;4;1;2;13;1;-2;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;13;7;30000;0;15;4;1;1;1;0;0;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;13;9;30000;0;10;4;1;2;1;0;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;13;13;30000;0;12;4;1;4;1;0;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;13;17;30000;0;10;4;1;4;1;-2;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;13;32;30000;0;12;4;1;2;1;-1;1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;15;4;30000;0;15;4;1;1;1;1;0;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;17;4;30000;0;16;4;1;3;1;1;-1;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;17;5;30000;0;13;4;1;2;1;0;-1;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;17;13;30000;0;14;4;1;2;1;-1;-2;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;17;17;30000;0;9;4;1;4;1;-1;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;17;32;30000;0;12;4;1;4;1;1;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;25;4;30000;0;14;4;1;3;1;0;-1;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;25;5;30000;0;15;4;1;2;25;1;-2;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;26;4;30000;0;17;4;1;3;1;-1;-1;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;28;4;30000;0;14;4;1;3;1;-1;-2;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;32;4;30000;0;18;4;1;1;1;0;-2;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;4;32;5;30000;0;14;4;1;2;1;-2;-2;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;32;13;30000;0;11;4;1;2;1;1;1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;32;17;30000;0;8;4;1;2;1;0;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;4;32;32;30000;0;13;4;1;4;1;1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;4;4;30000;0;14;5;1;2;1;-1;-1;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;4;5;30000;0;13;5;1;4;1;-2;-1;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;4;7;30000;0;13;5;1;2;1;0;-2;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;4;9;30000;0;11;5;1;1;1;-2;0;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;4;13;30000;0;12;5;1;4;5;-1;-1;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;4;17;30000;0;12;5;1;2;5;1;0;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;4;25;30000;0;12;5;1;4;5;0;-1;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;4;32;30000;0;12;5;1;2;5;-1;1;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;5;4;30000;0;15;5;1;2;1;-2;-1;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;5;5;30000;0;12;5;1;2;1;1;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;5;7;30000;0;12;5;1;3;1;-1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;5;9;30000;0;12;5;1;2;1;-2;1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;5;13;30000;0;5;5;1;4;1;-2;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;5;17;30000;0;4;5;1;4;1;0;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;5;28;30000;0;19;5;1;3;5;-1;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;5;32;30000;0;3;5;1;3;1;1;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;7;4;30000;0;15;5;1;2;1;-2;-1;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;7;5;30000;0;13;5;1;4;1;-2;-2;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;7;7;30000;0;10;5;1;2;1;-1;1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;7;9;30000;0;10;5;1;2;1;0;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;7;13;30000;0;10;5;1;3;1;-1;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;9;4;30000;0;16;5;1;2;1;-1;1;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;9;5;30000;0;13;5;1;2;1;1;0;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;9;7;30000;0;9;5;1;2;1;0;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;9;9;30000;0;10;5;1;2;1;0;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;13;4;30000;0;16;5;1;2;1;0;-1;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;13;5;30000;0;15;5;1;2;1;1;-2;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;13;7;30000;0;8;5;1;3;1;1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;13;13;30000;0;10;5;1;2;1;1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;13;17;30000;0;12;5;1;2;1;0;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;13;28;30000;0;5;5;1;2;13;-2;-1;0;1;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;13;32;30000;0;4;5;1;5;1;0;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;17;4;30000;0;18;5;1;2;1;0;-1;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;17;5;30000;0;13;5;1;5;1;-2;-1;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;17;13;30000;0;10;5;1;2;1;-2;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;17;17;30000;0;16;5;1;2;1;-2;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;17;32;30000;0;10;5;1;2;1;-1;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;25;4;30000;0;18;5;1;2;1;-2;-1;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;32;4;30000;0;18;5;1;2;1;1;0;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;5;32;5;30000;0;15;5;1;2;1;1;-2;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;32;13;30000;0;11;5;1;4;1;0;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;32;17;30000;0;11;5;1;5;1;0;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;5;32;32;30000;0;12;5;1;4;1;0;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;6;6;6;30000;0;10;6;1;2;1;1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;6;6;7;30000;0;12;6;1;6;1;-1;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;6;6;8;30000;0;12;6;1;3;1;-1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;6;7;6;30000;0;8;6;1;2;1;-2;1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;6;7;7;30000;0;10;6;1;2;1;0;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;6;7;8;30000;0;10;6;1;3;1;-2;1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;6;8;6;30000;0;8;6;1;2;1;-2;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;6;8;7;30000;0;10;6;1;6;1;1;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;6;8;8;30000;0;10;6;1;2;1;0;1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;7;4;4;30000;0;13;7;1;2;1;1;0;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;7;4;5;30000;0;11;7;1;2;1;0;-2;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;4;7;30000;0;13;7;1;2;1;-2;-1;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;4;9;30000;0;11;7;1;3;1;-1;2;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;4;13;30000;0;12;7;1;7;7;0;-1;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;7;5;4;30000;0;12;7;1;1;1;0;-2;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;5;5;30000;0;11;7;1;3;1;0;0;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;7;5;7;30000;0;12;7;1;4;7;-2;0;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;5;9;30000;0;12;7;1;3;1;-2;-1;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;5;13;30000;0;12;7;1;3;1;-1;-2;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;6;6;30000;0;12;7;1;3;1;0;-2;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;7;6;7;30000;0;12;7;1;3;1;-1;2;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;7;6;8;30000;0;12;7;1;2;1;1;-2;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;7;7;4;30000;0;15;7;1;4;1;-1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;7;7;5;30000;0;12;7;1;5;1;-2;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;7;6;30000;0;10;7;1;4;1;-2;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;7;7;30000;0;10;7;1;6;1;0;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;7;8;30000;0;12;7;1;5;1;0;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;7;7;9;30000;0;10;7;1;4;1;-1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;7;7;13;30000;0;12;7;1;7;1;-2;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;7;8;6;30000;0;10;7;1;2;1;-2;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;8;7;30000;0;12;7;1;3;1;0;4;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;8;8;30000;0;10;7;1;7;1;1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;7;9;4;30000;0;14;7;1;1;1;-2;-2;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;9;5;30000;0;10;7;1;4;1;-1;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;9;7;30000;0;12;7;1;5;1;1;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;13;4;30000;0;14;7;1;4;1;1;-1;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;13;5;30000;0;13;7;1;5;1;-1;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;7;13;7;30000;0;12;7;1;4;1;1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;7;13;13;30000;0;10;7;1;6;1;-2;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;8;6;6;30000;0;11;8;1;7;1;1;0;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;8;6;7;30000;0;12;8;1;6;1;-1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;8;6;8;30000;0;11;8;1;1;1;-1;-2;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;8;7;6;30000;0;12;8;1;7;1;-1;-1;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;8;7;7;30000;0;12;8;1;7;1;-1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;8;7;8;30000;0;11;8;1;1;1;-1;0;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;8;8;6;30000;0;10;8;1;2;1;0;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;8;8;7;30000;0;12;8;1;2;1;0;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;8;8;8;30000;0;12;8;1;2;1;-2;1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;9;4;4;30000;0;13;9;1;2;1;0;-1;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;9;4;5;30000;0;13;9;1;7;1;0;0;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;9;4;7;30000;0;12;9;1;7;1;-2;0;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;9;4;9;30000;0;10;8;1;7;9;1;-2;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;9;4;13;30000;0;12;9;1;9;9;-1;-2;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;9;5;4;30000;0;11;9;1;4;1;-2;-1;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;9;5;5;30000;0;13;9;1;9;1;0;0;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;9;5;7;30000;0;10;8;1;2;9;1;1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;9;5;9;30000;0;5;9;1;8;1;1;2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;9;7;4;30000;0;12;9;1;2;1;-2;-1;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;9;7;5;30000;0;8;8;1;9;1;1;1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;9;7;7;30000;0;10;8;1;3;1;-1;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;9;9;4;30000;0;17;9;1;7;1;-1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;9;9;5;30000;0;17;9;1;4;1;-2;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;9;9;9;30000;0;12;9;1;4;1;-2;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;9;9;16;30000;0;19;9;1;8;1;0;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;9;9;22;30000;0;12;8;1;7;1;0;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;9;9;32;30000;0;6;9;1;8;1;-2;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;9;13;4;30000;0;13;9;1;2;1;-2;1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;9;16;9;30000;0;16;9;1;7;1;-1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;9;16;16;30000;0;10;8;1;2;1;0;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;9;16;22;30000;0;6;8;1;2;1;-2;2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;9;22;9;30000;0;16;9;1;7;1;0;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;9;22;16;30000;0;12;9;1;5;1;-2;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;9;22;22;30000;0;15;9;1;8;1;1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;9;22;32;30000;0;15;9;1;7;1;-1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;9;32;9;30000;0;16;9;1;6;32;1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;9;32;22;30000;0;10;9;1;6;32;1;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;9;32;32;30000;0;12;9;1;2;32;0;1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;10;4;4;30000;0;8;8;1;5;1;0;1;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;10;4;10;30000;0;10;8;1;3;10;0;3;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;10;10;4;30000;0;17;10;1;7;1;0;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;10;10;10;30000;0;12;10;1;7;1;1;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;13;4;4;30000;0;8;8;1;7;1;1;-1;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;4;5;30000;0;8;8;1;2;13;-1;2;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;13;4;7;30000;0;8;8;1;10;1;0;0;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;4;9;30000;0;9;8;1;10;13;0;2;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;13;4;13;30000;0;6;8;1;9;13;-2;6;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;4;17;30000;0;5;8;1;7;13;0;4;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;4;32;30000;0;12;8;1;13;1;0;2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;13;5;4;30000;0;8;8;1;8;1;-2;3;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;5;5;30000;0;8;8;1;3;1;1;2;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;5;7;30000;0;8;8;1;10;1;1;0;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;5;13;30000;0;9;8;1;5;13;1;-2;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;5;17;30000;0;12;8;1;7;1;-1;-1;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;5;28;30000;0;3;13;1;3;13;-2;2;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;5;32;30000;0;3;8;1;3;1;0;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;13;7;4;30000;0;11;8;1;4;1;1;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;13;7;5;30000;0;9;8;1;4;1;0;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;13;7;7;30000;0;13;8;1;13;1;-2;1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;13;7;13;30000;0;12;8;1;4;1;0;1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;13;9;4;30000;0;12;8;1;11;1;1;2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;13;13;4;30000;0;17;13;1;11;1;-2;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;13;5;30000;0;17;13;1;9;1;-1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;13;7;30000;0;16;13;1;4;13;-2;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;13;13;13;30000;0;12;13;1;6;1;-1;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;13;13;17;30000;0;15;13;1;10;1;1;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;13;13;28;30000;0;19;13;1;13;13;-1;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;13;13;32;30000;0;19;13;1;9;1;1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;17;4;30000;0;17;13;1;10;1;1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;17;5;30000;0;17;13;1;6;1;-1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;17;13;30000;0;16;13;1;6;1;1;1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;17;17;30000;0;15;13;1;3;1;0;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;17;32;30000;0;20;13;1;3;1;1;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;32;4;30000;0;21;8;1;13;1;1;2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;32;5;30000;0;17;13;1;5;1;1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;32;13;30000;0;18;13;1;2;1;-1;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;13;32;17;30000;0;19;8;1;4;1;0;3;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;13;32;32;30000;0;15;13;1;11;1;0;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;14;14;14;30000;0;16;14;1;14;14;1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;14;14;55;30000;0;20;14;1;7;14;-1;2;0;1;0;1;0;1;1;0;0 +NVIDIA GH200 480GB [0x3528];3;14;55;14;30000;0;26;14;1;7;55;-2;1;0;1;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;14;55;55;30000;0;60;14;1;10;55;0;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;15;4;4;30000;0;8;8;1;3;1;1;2;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;15;15;15;30000;0;21;15;1;10;1;-1;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;16;9;9;30000;0;12;8;1;16;1;-2;3;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;16;9;16;30000;0;10;8;1;14;16;1;0;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;16;9;22;30000;0;12;8;1;14;16;-2;0;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;16;16;9;30000;0;16;8;1;3;1;0;2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;16;16;16;30000;0;12;8;1;13;1;1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;16;16;22;30000;0;21;16;1;13;1;-2;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;16;22;9;30000;0;16;16;1;8;1;0;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;16;22;16;30000;0;12;16;1;15;1;-2;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;16;22;22;30000;0;22;16;1;14;1;1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;4;4;30000;0;8;8;1;9;1;-2;0;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;4;5;30000;0;10;8;1;14;1;0;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;17;4;13;30000;0;5;8;1;3;17;0;1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;4;17;30000;0;12;8;1;14;17;0;-2;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;4;32;30000;0;3;8;1;2;17;-1;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;17;5;4;30000;0;8;8;1;15;1;1;2;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;5;5;30000;0;8;8;1;15;17;-2;-2;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;17;5;13;30000;0;10;8;1;7;17;0;-2;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;17;5;17;30000;0;12;8;1;9;17;-2;-1;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;5;32;30000;0;16;8;1;2;17;-1;0;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;13;4;30000;0;14;17;1;11;1;0;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;13;5;30000;0;13;17;1;5;1;-1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;13;13;30000;0;19;8;1;16;1;-1;2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;17;13;17;30000;0;17;17;1;7;1;1;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;17;13;32;30000;0;19;17;1;16;1;-1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;17;4;30000;0;23;17;1;7;1;-2;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;17;17;5;30000;0;23;17;1;15;1;-1;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;17;17;13;30000;0;21;17;1;8;1;-1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;17;17;30000;0;16;17;1;9;1;0;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;17;32;30000;0;5;17;1;3;1;0;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;32;4;30000;0;23;17;1;11;1;-2;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;32;5;30000;0;22;17;1;6;1;0;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;32;13;30000;0;15;17;1;9;1;-2;-1;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;17;32;17;30000;0;15;17;1;13;1;0;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;17;32;32;30000;0;20;17;1;17;1;-2;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;20;20;20;30000;0;19;20;1;8;1;1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;22;9;9;30000;0;10;8;1;9;1;0;3;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;22;9;16;30000;0;10;8;1;21;1;-1;1;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;22;9;22;30000;0;12;8;1;6;22;1;0;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;22;9;32;30000;0;20;8;1;20;22;0;1;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;22;16;9;30000;0;19;8;1;5;1;-2;4;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;22;16;16;30000;0;19;8;1;15;1;0;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;22;16;22;30000;0;12;8;1;3;1;0;2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;22;22;9;30000;0;23;22;1;17;1;0;0;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;22;22;16;30000;0;20;22;1;5;1;1;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;22;22;22;30000;0;23;22;1;14;1;-1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;22;22;32;30000;0;30;22;1;7;1;0;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;22;32;9;30000;0;17;22;1;20;32;-1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;22;32;22;30000;0;20;22;1;9;32;0;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;22;32;32;30000;0;40;22;1;17;32;-1;3;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;23;23;23;30000;0;20;23;1;16;23;1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;24;24;24;30000;0;20;24;1;16;1;-2;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;25;4;4;30000;0;8;8;1;13;25;0;-2;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;25;4;5;30000;0;8;8;1;18;25;0;5;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;25;5;4;30000;0;8;8;1;21;1;-2;1;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;25;25;25;30000;0;24;25;1;21;1;-1;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;26;4;4;30000;0;10;8;1;6;26;-1;3;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;28;4;4;30000;0;11;8;1;12;1;-1;5;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;28;28;28;30000;0;6;16;1;8;1;-2;2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;30;30;30;30000;0;50;30;1;19;1;-1;-2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;32;4;4;30000;0;11;8;1;15;32;-1;-2;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;32;4;5;30000;0;11;8;1;16;32;1;0;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;4;13;30000;0;10;8;1;1;32;1;0;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;32;4;17;30000;0;12;8;1;9;32;1;0;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;4;32;30000;0;6;8;1;21;32;1;4;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;5;4;30000;0;14;8;1;14;1;1;-2;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;32;5;5;30000;0;15;8;1;10;1;-2;-2;0;0;0;1;0;2;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;5;13;30000;0;10;8;1;31;32;1;-2;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;5;17;30000;0;12;8;1;3;32;0;4;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;5;32;30000;0;15;8;1;25;32;0;0;0;0;0;1;0;2;0;0;0 +NVIDIA GH200 480GB [0x3528];3;32;13;4;30000;0;23;8;1;4;1;-1;-2;0;0;0;1;0;0;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;13;5;30000;0;23;8;1;4;1;-1;-2;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;32;13;13;30000;0;20;8;1;18;1;-1;5;0;0;0;1;0;0;0;0;0 +NVIDIA GH200 480GB [0x3528];3;32;13;17;30000;0;30;8;1;5;1;-2;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;13;32;30000;0;13;8;1;16;1;-1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;17;4;30000;0;23;32;1;17;1;-1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;17;5;30000;0;18;32;1;14;1;-1;1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;17;13;30000;0;20;32;1;11;1;0;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;17;17;30000;0;20;32;1;4;1;-2;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;17;32;30000;0;10;8;1;23;1;1;4;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;32;4;30000;0;26;32;1;4;1;0;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;32;5;30000;0;29;32;1;11;1;1;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;32;9;30000;0;23;32;1;21;32;0;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;32;13;30000;0;30;32;1;6;1;0;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;32;17;30000;0;23;32;1;22;1;0;-1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;32;22;30000;0;20;32;1;27;32;0;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;32;32;32;30000;0;19;32;1;29;1;-1;-2;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;35;35;35;30000;0;57;24;1;2;1;0;2;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;36;36;36;30000;0;15;36;1;2;1;-1;3;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;40;40;40;30000;0;57;16;1;6;1;-1;1;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;55;14;14;30000;0;23;55;1;29;55;-1;4;0;0;0;1;0;1;0;0;0 +NVIDIA GH200 480GB [0x3528];3;55;14;55;30000;0;8;55;1;2;55;1;7;0;0;0;1;0;1;1;0;0 +NVIDIA GH200 480GB [0x3528];3;55;55;14;30000;0;60;55;1;51;55;0;-2;0;1;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;55;55;55;30000;0;13;55;1;5;55;0;0;0;0;0;1;0;1;2;0;0 +NVIDIA GH200 480GB [0x3528];3;64;64;64;30000;0;5;64;1;18;64;0;6;0;0;0;1;0;1;0;0;0 diff --git a/src/acc/opencl/smm/params/tune_multiply_Mi250.csv b/src/acc/opencl/smm/params/tune_multiply_Mi250.csv index 4410500501c..d73036e2ca9 100644 --- a/src/acc/opencl/smm/params/tune_multiply_Mi250.csv +++ b/src/acc/opencl/smm/params/tune_multiply_Mi250.csv @@ -281,6 +281,7 @@ gfx90a [0x989f];3;12;23;12;30000;0;20;12;1;1;1;0;-2;0;0;1;1;1;2;2;1;0 gfx90a [0x989f];3;12;23;23;30000;0;3;8;1;8;1;-2;-2;0;0;0;1;0;1;2;0;0 gfx90a [0x989f];3;13;13;13;30000;0;12;13;1;11;1;0;-1;0;1;0;1;1;0;2;1;0 gfx90a [0x989f];3;13;13;23;30000;0;12;13;1;1;1;-2;-2;0;1;0;1;1;2;0;1;0 +gfx90a [0x989f];3;13;13;32;30000;0;3;13;1;9;13;0;2;0;0;0;1;0;1;2;0;0 gfx90a [0x989f];3;13;23;13;30000;0;30;8;1;10;1;1;2;0;0;0;1;0;1;0;0;0 gfx90a [0x989f];3;13;23;23;30000;0;3;8;1;6;1;-1;2;0;0;0;1;0;1;2;0;0 gfx90a [0x989f];3;14;14;14;30000;0;10;14;1;10;1;0;0;0;1;1;1;1;0;2;0;0 @@ -351,6 +352,7 @@ gfx90a [0x989f];3;18;23;23;30000;0;4;8;1;16;1;0;3;0;0;0;1;0;1;2;0;0 gfx90a [0x989f];3;19;19;19;30000;0;40;8;1;10;1;1;3;0;0;0;1;0;1;2;0;0 gfx90a [0x989f];3;19;19;23;30000;0;40;8;1;15;1;-1;-2;0;0;0;1;0;1;0;0;0 gfx90a [0x989f];3;23;23;23;30000;0;4;8;1;22;23;-1;3;0;0;0;1;0;1;0;0;0 +gfx90a [0x989f];3;28;28;28;30000;0;3;28;1;28;28;-2;2;0;0;0;1;0;1;0;0;0 gfx90a [0x989f];3;32;32;32;30000;0;25;32;1;20;1;-2;0;0;1;0;1;0;2;0;0;0 gfx90a [0x989f];3;35;17;17;30000;0;15;35;1;29;1;1;0;0;1;0;1;0;2;1;0;0 gfx90a [0x989f];3;35;17;32;30000;0;20;35;1;1;1;0;-2;1;1;0;1;1;2;0;0;0 diff --git a/src/acc/opencl/smm/params/tune_multiply_PVC.csv b/src/acc/opencl/smm/params/tune_multiply_PVC.csv index 34e16e0b964..5b5a9648737 100644 --- a/src/acc/opencl/smm/params/tune_multiply_PVC.csv +++ b/src/acc/opencl/smm/params/tune_multiply_PVC.csv @@ -2,7 +2,7 @@ DEVICE;TYPEID;M;N;K;S;GFLOPS;BS;BM;BN;BK;WS;WG;LU;NZ;AL;TB;TC;AP;AA;AB;AC Intel(R) Data Center GPU Max 1550 [0x0bd5];3;2;2;2;30000;0;8;2;1;1;1;-1;0;0;1;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;3;3;1;30000;0;8;3;1;1;1;1;1;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;3;3;3;30000;0;8;3;1;1;1;-1;-2;0;0;0;1;0;2;0;0;0 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;4;30000;0;10;4;1;4;1;1;-1;0;0;0;1;0;2;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;4;30000;0;9;4;1;3;1;-1;1;0;1;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;5;30000;0;10;4;1;3;1;-1;-2;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;7;30000;0;9;4;1;3;1;1;0;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;9;30000;0;8;4;1;2;1;-2;-2;0;0;0;1;0;2;2;0;0 @@ -13,7 +13,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;17;30000;0;8;4;1;1;1;-1;1;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;25;30000;0;8;4;1;1;1;-2;0;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;26;30000;0;8;4;1;1;1;-1;0;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;28;30000;0;8;4;1;1;1;1;1;0;0;0;1;0;2;0;0;0 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;32;30000;0;8;4;1;1;1;-2;-1;0;0;0;1;0;2;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;32;30000;0;8;4;1;1;1;-2;-2;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;45;30000;0;8;4;1;1;4;-1;-1;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;5;4;30000;0;10;4;1;1;1;-2;0;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;5;5;30000;0;10;4;1;3;1;0;-2;0;0;0;1;0;0;0;0;0 @@ -71,7 +71,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;17;32;30000;0;8;4;1;1;1;1;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;4;30000;0;14;4;1;2;1;-2;0;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;5;30000;0;14;4;1;4;1;1;0;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;7;30000;0;14;4;1;3;25;1;1;0;0;0;1;0;2;0;0;0 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;9;30000;0;10;4;1;1;25;0;0;0;0;0;1;0;2;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;9;30000;0;10;4;1;1;25;1;0;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;13;30000;0;8;4;1;1;25;-1;0;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;25;30000;0;8;4;1;1;25;-2;0;0;1;0;1;1;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;26;30000;0;8;4;1;1;25;-2;0;0;0;0;1;1;0;0;0;0 @@ -131,16 +131,19 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;4;28;30000;0;8;5;1;1;5;-2;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;4;32;30000;0;8;5;1;1;1;-2;1;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;4;45;30000;0;8;5;1;1;5;-2;-2;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;4;30000;0;12;5;1;1;1;1;-2;0;0;0;1;0;0;2;0;0 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;5;30000;0;12;5;1;1;1;1;1;0;0;0;1;0;0;1;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;5;30000;0;10;5;1;4;1;-1;0;0;1;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;7;30000;0;8;5;1;1;1;-2;0;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;9;30000;0;8;5;1;1;1;-1;-2;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;13;30000;0;8;5;1;1;1;-2;1;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;16;30000;0;8;5;1;1;1;-2;1;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;17;30000;0;8;5;1;1;1;-1;-1;0;0;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;20;30000;0;8;5;1;1;5;1;0;0;1;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;24;30000;0;8;5;1;1;1;-2;-2;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;26;30000;0;8;5;1;1;1;-2;-2;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;28;30000;0;8;5;1;1;5;-2;-2;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;32;30000;0;8;5;1;1;1;-2;-1;0;0;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;6;20;30000;0;8;5;1;1;6;-1;-1;0;0;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;6;32;30000;0;8;5;1;1;6;1;1;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;7;4;30000;0;12;5;1;1;1;-2;1;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;7;5;30000;0;12;5;1;1;1;-1;0;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;7;7;30000;0;10;5;1;1;1;-2;-1;0;0;0;1;0;0;2;0;0 @@ -156,6 +159,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;7;30000;0;14;5;1;1;1;-1;1;0;0; Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;13;30000;0;10;5;1;1;1;1;0;0;1;1;1;1;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;16;30000;0;8;5;1;1;1;-2;-1;1;0;0;1;1;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;17;30000;0;8;5;1;1;1;-1;-1;1;1;0;1;1;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;20;30000;0;8;5;1;1;13;-2;-1;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;24;30000;0;8;5;1;1;1;-2;4;1;1;0;1;1;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;26;30000;0;8;5;1;1;1;-1;3;0;0;0;1;1;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;32;30000;0;8;5;1;1;1;-1;-2;0;0;0;1;1;2;2;0;0 @@ -171,6 +175,11 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;17;5;30000;0;16;5;1;1;1;1;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;17;13;30000;0;8;5;1;1;1;-2;0;1;0;0;1;1;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;17;17;30000;0;8;5;1;1;1;-1;-2;0;1;0;1;1;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;17;32;30000;0;8;5;1;1;1;-1;-2;0;0;1;1;1;2;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;20;5;30000;0;16;5;1;2;20;1;1;0;1;0;1;0;0;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;20;6;30000;0;14;5;1;1;20;-1;1;0;0;0;1;0;0;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;20;13;30000;0;14;5;1;1;20;-1;-2;0;1;0;1;0;2;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;20;20;30000;0;8;5;1;1;20;-1;-2;0;0;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;20;32;30000;0;8;5;1;1;20;1;-2;0;1;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;24;5;30000;0;16;5;1;1;1;1;1;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;24;13;30000;0;15;5;1;1;1;0;1;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;24;16;30000;0;15;5;1;1;1;-1;1;0;0;0;1;0;2;2;0;0 @@ -185,7 +194,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;25;13;30000;0;8;5;1;1;25;0;-1;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;25;25;30000;0;8;5;1;1;25;0;-1;1;0;0;1;1;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;25;26;30000;0;8;5;1;1;25;-2;-2;0;0;0;1;1;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;25;28;30000;0;8;5;1;1;25;-1;0;0;0;0;1;1;2;0;0;0 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;25;32;30000;0;8;5;1;1;25;0;1;0;0;0;1;0;0;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;25;32;30000;0;8;5;1;1;25;1;0;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;25;45;30000;0;8;5;1;1;25;-1;0;0;0;0;1;1;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;26;4;30000;0;18;5;1;1;26;-2;1;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;26;5;30000;0;18;5;1;1;1;-2;-2;0;0;0;1;0;0;0;0;0 @@ -212,10 +221,12 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;28;32;30000;0;8;5;1;1;28;-2;-2;1; Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;28;45;30000;0;8;5;1;1;28;1;-2;0;1;0;1;1;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;4;30000;0;18;5;1;5;1;0;-1;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;5;30000;0;17;5;1;1;1;1;-2;0;0;0;1;0;2;1;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;6;30000;0;18;5;1;2;32;0;-1;0;1;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;7;30000;0;15;5;1;1;32;-2;-2;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;9;30000;0;15;5;1;1;32;1;-1;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;13;30000;0;15;5;1;1;1;1;0;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;17;30000;0;8;5;1;1;1;-1;-2;0;1;0;1;0;2;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;20;30000;0;18;5;1;1;32;-1;1;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;24;30000;0;8;5;1;1;32;-1;0;0;0;0;1;1;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;25;30000;0;8;5;1;1;32;-1;2;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;26;30000;0;8;5;1;1;32;-2;-1;0;0;0;1;0;0;0;0;0 @@ -232,15 +243,31 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;45;26;30000;0;6;5;1;1;45;-2;-1;0; Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;45;28;30000;0;5;5;1;1;45;-1;-2;1;0;1;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;45;32;30000;0;4;5;1;1;45;-1;-2;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;45;45;30000;0;9;5;1;1;45;-2;-1;0;0;0;1;1;0;2;0;1 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;5;20;30000;0;8;6;1;1;6;-2;-1;0;0;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;5;32;30000;0;8;6;1;1;6;-2;-1;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;6;6;30000;0;9;6;1;3;1;-2;0;0;1;0;1;1;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;6;7;30000;0;10;6;1;1;1;-1;-1;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;6;8;30000;0;10;6;1;1;1;-2;-1;0;0;0;1;0;0;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;6;20;30000;0;10;6;1;1;6;-2;-2;0;1;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;6;32;30000;0;10;6;1;1;6;-1;-1;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;7;6;30000;0;14;6;1;1;1;-1;-2;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;7;7;30000;0;10;6;1;1;1;-2;0;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;7;8;30000;0;11;6;1;1;1;1;-2;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;8;6;30000;0;16;6;1;1;1;-1;0;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;8;7;30000;0;14;6;1;1;1;-1;-2;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;8;8;30000;0;13;6;1;1;1;-1;0;0;0;0;1;0;2;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;13;20;30000;0;10;6;1;1;13;1;1;0;0;0;1;0;0;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;13;32;30000;0;8;6;1;1;13;1;1;0;0;0;1;0;2;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;20;5;30000;0;18;6;1;1;20;-1;1;0;0;0;1;0;0;1;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;20;6;30000;0;15;6;1;1;20;-2;1;0;0;0;1;0;0;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;20;13;30000;0;14;6;1;1;20;-1;-1;0;0;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;20;20;30000;0;8;6;1;1;20;-1;-1;0;0;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;20;32;30000;0;8;6;1;1;20;0;-2;0;0;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;32;5;30000;0;20;6;1;6;32;1;-1;0;0;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;32;6;30000;0;18;6;1;6;32;1;1;0;0;0;1;0;2;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;32;13;30000;0;15;6;1;1;32;1;1;0;0;0;1;0;2;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;32;20;30000;0;8;6;1;1;32;-1;1;0;0;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;32;32;30000;0;8;6;1;1;32;1;-2;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;4;4;30000;0;14;7;1;1;1;-2;1;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;4;5;30000;0;14;7;1;3;1;1;0;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;4;7;30000;0;11;7;1;1;1;-2;0;0;0;0;1;0;2;0;0;0 @@ -267,7 +294,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;6;8;30000;0;10;7;1;1;1;1;1;0;0;0; Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;4;30000;0;14;7;1;1;1;1;-1;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;5;30000;0;16;7;1;1;1;-1;0;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;6;30000;0;16;7;1;1;1;1;-1;0;0;0;1;0;0;0;0;0 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;7;30000;0;9;7;1;2;1;1;-2;0;1;0;1;1;0;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;7;30000;0;9;7;1;3;1;0;-2;0;1;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;8;30000;0;14;7;1;1;1;1;-1;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;9;30000;0;13;7;1;1;1;-2;-1;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;13;30000;0;10;7;1;1;1;-1;-2;0;0;0;1;0;0;2;0;0 @@ -279,7 +306,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;45;30000;0;8;7;1;1;7;-1;-1;0;0; Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;8;6;30000;0;16;7;1;1;1;-2;-2;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;8;7;30000;0;14;7;1;1;1;1;1;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;8;8;30000;0;13;7;1;1;1;1;1;0;0;0;1;0;2;0;0;0 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;9;4;30000;0;16;7;1;1;1;-1;0;0;0;0;1;0;2;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;9;4;30000;0;16;7;1;2;1;-1;-2;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;9;5;30000;0;16;7;1;1;1;-2;-1;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;9;7;30000;0;17;7;1;1;1;-1;0;0;1;0;1;1;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;9;9;30000;0;14;7;1;1;9;-2;-2;0;0;0;1;0;2;0;0;0 @@ -405,8 +432,8 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;13;30000;0;15;9;1;2;28;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;25;30000;0;8;9;1;1;28;0;0;0;1;0;1;0;1;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;26;30000;0;10;9;1;1;28;-2;0;0;1;1;1;0;0;0;1;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;28;30000;0;12;9;1;1;28;0;0;0;0;0;1;0;2;0;0;0 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;32;30000;0;15;9;1;5;28;-2;-1;0;0;0;1;0;2;0;0;1 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;45;30000;0;19;9;1;1;28;1;0;0;0;0;1;0;0;0;0;1 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;32;30000;0;15;9;1;4;28;-2;-1;0;0;0;1;0;1;0;0;1 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;45;30000;0;15;9;1;1;28;1;0;0;0;0;1;0;0;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;32;4;30000;0;39;9;1;1;32;-1;-1;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;32;5;30000;0;38;9;1;1;32;-1;-1;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;32;7;30000;0;25;9;1;8;32;0;0;0;0;0;1;0;1;2;0;0 @@ -430,6 +457,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;10;15;4;30000;0;24;10;1;1;15;-1;-2; Intel(R) Data Center GPU Max 1550 [0x0bd5];3;10;15;10;30000;0;16;10;1;3;15;-2;-1;0;0;0;1;0;1;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;10;15;15;30000;0;15;10;1;1;15;1;-1;0;0;0;1;0;0;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;12;12;12;30000;0;8;12;1;12;12;-2;-1;1;1;1;1;1;2;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;1;11;30000;0;8;13;1;1;13;-1;1;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;4;4;30000;0;16;13;1;1;1;-2;-2;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;4;5;30000;0;13;13;1;1;1;-1;-2;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;4;7;30000;0;13;13;1;2;1;1;0;0;0;0;1;0;0;2;0;0 @@ -448,12 +476,15 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;9;30000;0;13;13;1;3;13;-1;-1;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;13;30000;0;8;13;1;3;1;-1;1;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;16;30000;0;8;13;1;1;1;-2;0;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;17;30000;0;8;13;1;7;1;-2;-2;0;1;0;1;1;0;2;1;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;20;30000;0;8;13;1;1;13;-2;-1;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;24;30000;0;8;13;1;1;1;-2;1;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;25;30000;0;8;13;1;1;13;1;-2;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;26;30000;0;8;13;1;10;1;-1;-2;0;0;0;1;0;0;2;1;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;28;30000;0;8;13;1;1;13;-2;0;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;32;30000;0;8;13;1;1;1;-2;0;0;1;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;45;30000;0;8;13;1;1;13;-1;0;0;0;0;1;0;2;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;6;20;30000;0;8;13;1;1;13;-1;0;0;0;0;1;0;2;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;6;32;30000;0;8;13;1;1;13;-1;-1;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;7;4;30000;0;17;13;1;1;1;-2;-2;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;7;5;30000;0;16;13;1;1;1;-2;0;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;7;7;30000;0;13;13;1;12;1;-2;1;0;0;0;1;0;2;2;0;0 @@ -481,11 +512,12 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;9;30000;0;24;13;1;1;13;-1;-2; Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;13;30000;0;8;13;1;4;1;-1;-2;0;1;1;1;1;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;14;30000;0;18;13;1;1;13;-1;1;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;17;30000;0;15;13;1;1;1;-1;1;0;0;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;20;30000;0;24;13;1;1;13;-1;0;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;24;30000;0;12;13;1;1;13;-1;-1;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;25;30000;0;8;13;1;1;13;-1;0;1;0;1;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;26;30000;0;14;13;1;1;13;-1;-2;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;28;30000;0;8;13;1;1;13;-1;-1;1;1;0;1;1;0;0;0;0 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;32;30000;0;8;13;1;1;1;-1;0;1;0;0;1;0;0;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;32;30000;0;8;13;1;1;1;-2;1;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;45;30000;0;8;13;1;1;13;-2;0;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;14;13;30000;0;15;13;1;1;14;-2;1;0;0;0;1;0;0;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;14;14;30000;0;15;13;1;1;14;-2;1;0;0;0;1;0;2;0;0;0 @@ -498,6 +530,11 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;17;5;30000;0;19;13;1;10;1;0;-1;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;17;13;30000;0;9;13;1;9;1;1;-1;1;1;0;1;1;2;0;1;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;17;17;30000;0;8;13;1;1;1;-2;1;1;0;1;1;1;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;17;32;30000;0;8;13;1;1;1;1;-2;1;0;0;1;1;2;0;1;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;20;5;30000;0;17;13;1;8;20;0;1;0;0;0;1;0;1;1;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;20;6;30000;0;9;13;1;12;20;-1;-2;0;0;0;1;0;1;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;20;13;30000;0;11;13;1;9;20;1;-1;0;0;0;1;0;1;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;20;20;30000;0;17;8;1;6;20;0;-1;0;0;0;1;0;1;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;20;32;30000;0;15;8;1;1;20;-2;1;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;24;5;30000;0;22;13;1;5;1;1;-1;1;1;0;1;0;1;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;24;13;30000;0;13;13;1;10;24;-1;0;0;0;0;1;0;1;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;24;24;30000;0;8;13;1;1;24;1;1;1;0;0;1;1;0;0;0;0 @@ -533,20 +570,22 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;9;30000;0;16;13;1;10;28;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;13;30000;0;13;13;1;11;28;1;1;0;0;0;1;0;1;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;25;30000;0;15;13;1;1;28;-1;-2;0;0;0;1;0;0;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;26;30000;0;15;13;1;4;28;1;-2;0;0;0;1;0;1;0;0;1 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;28;30000;0;16;13;1;1;28;1;-1;0;0;0;1;0;0;0;0;1 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;28;30000;0;15;13;1;1;28;1;0;0;0;0;1;0;0;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;32;30000;0;15;13;1;1;28;1;-2;0;0;0;1;0;0;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;45;30000;0;15;13;1;1;28;0;1;0;0;0;1;0;0;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;4;30000;0;25;13;1;7;1;-1;0;0;0;0;1;0;1;1;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;5;30000;0;25;13;1;11;1;0;-2;0;0;0;1;0;1;1;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;6;30000;0;25;13;1;5;32;-2;0;0;0;0;1;0;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;7;30000;0;25;13;1;4;32;0;0;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;9;30000;0;16;13;1;7;32;-2;0;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;13;30000;0;13;13;1;13;1;-1;-1;0;0;0;1;0;1;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;14;30000;0;13;13;1;12;32;-2;-1;0;0;0;1;0;1;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;17;30000;0;8;13;1;1;1;-1;1;1;1;0;1;1;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;20;30000;0;15;13;1;1;32;-1;2;0;0;0;1;0;1;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;24;30000;0;16;13;1;1;32;-2;-2;1;1;0;1;0;2;2;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;25;30000;0;15;13;1;1;32;0;-2;0;0;0;1;0;0;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;26;30000;0;15;13;1;1;32;0;-2;0;0;0;1;0;0;0;0;1 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;28;30000;0;15;13;1;4;32;-2;1;0;0;0;1;0;2;2;0;1 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;28;30000;0;15;13;1;4;32;1;0;0;0;0;1;0;1;2;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;32;30000;0;15;13;1;1;1;-2;0;0;0;0;1;0;0;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;45;30000;0;15;13;1;1;32;-1;1;0;0;0;1;0;0;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;45;4;30000;0;30;13;1;11;45;1;1;0;0;0;1;0;0;1;0;0 @@ -625,7 +664,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;5;30000;0;35;16;1;1;1;-2;0;0; Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;9;30000;0;25;16;1;1;1;-1;-2;1;1;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;14;30000;0;25;16;1;1;16;-1;1;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;16;30000;0;8;16;1;3;1;-2;1;0;1;0;1;0;0;2;0;0 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;22;30000;0;15;16;1;1;1;-1;0;0;0;0;1;0;0;0;0;1 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;22;30000;0;15;16;1;1;1;-1;1;0;0;0;1;0;0;2;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;29;30000;0;25;16;1;1;16;1;1;0;0;0;1;0;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;55;30000;0;8;16;1;1;16;-2;0;1;1;0;1;1;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;22;9;30000;0;21;16;1;9;1;-2;-1;1;1;1;1;1;0;0;1;0 @@ -636,7 +675,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;26;5;30000;0;20;16;1;15;1;-1;1;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;29;14;30000;0;15;8;1;13;29;-1;0;0;0;0;1;0;1;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;29;16;30000;0;15;8;1;13;29;1;-1;0;0;0;1;0;1;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;29;29;30000;0;16;16;1;1;29;1;0;1;0;1;1;1;0;0;0;1 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;29;55;30000;0;15;16;1;1;29;1;-1;0;0;0;1;0;0;0;0;1 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;29;55;30000;0;15;16;1;1;29;1;0;0;0;0;1;0;0;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;55;16;30000;0;32;16;1;1;55;1;-1;1;1;1;1;0;2;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;55;29;30000;0;30;16;1;1;55;-1;0;1;0;0;1;1;2;2;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;55;55;30000;0;15;16;1;1;55;1;1;1;1;0;1;0;2;0;0;0 @@ -662,7 +701,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;17;13;30000;0;8;17;1;1;1;0;1;1;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;17;17;30000;0;8;17;1;4;1;-1;-1;0;1;1;1;1;2;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;17;32;30000;0;15;17;1;1;1;-2;1;0;0;0;1;0;0;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;17;35;30000;0;15;17;1;1;1;-2;-1;0;0;0;1;0;0;0;0;1 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;32;4;30000;0;25;17;1;7;1;-2;-2;0;0;0;1;0;1;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;32;4;30000;0;25;17;1;6;1;-2;-2;0;0;0;1;0;1;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;32;5;30000;0;28;17;1;8;1;-2;1;0;0;1;1;1;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;32;13;30000;0;11;17;1;1;0;-2;1;1;0;1;1;1;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;32;17;30000;0;15;17;1;10;1;-2;-2;0;0;0;1;0;2;2;0;1 @@ -671,7 +710,15 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;32;35;30000;0;15;17;1;1;1;-1;-1; Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;35;17;30000;0;30;17;1;1;1;-2;1;0;0;0;1;0;0;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;35;32;30000;0;30;17;1;1;1;1;-2;0;1;0;1;1;0;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;35;35;30000;0;30;17;1;1;1;0;-2;1;1;1;1;1;2;0;0;1 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;20;5;30000;0;13;20;1;11;20;-2;1;0;0;0;1;0;2;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;20;6;30000;0;15;8;1;14;20;-2;-2;0;0;0;1;0;1;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;20;20;30000;0;8;20;1;1;1;-1;1;0;1;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;20;32;30000;0;15;20;1;1;20;0;3;0;0;0;1;0;2;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;32;5;30000;0;21;20;1;14;32;-2;1;0;0;0;1;0;2;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;32;6;30000;0;18;20;1;9;32;1;0;0;0;0;1;0;1;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;32;13;30000;0;18;8;1;7;32;-1;2;0;0;0;1;0;1;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;32;20;30000;0;16;20;1;4;32;1;2;0;0;0;1;0;1;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;32;32;30000;0;6;8;1;13;32;0;1;0;0;0;1;0;1;1;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;22;9;9;30000;0;14;22;1;1;1;-1;-1;1;1;0;1;1;0;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;22;9;16;30000;0;10;22;1;1;1;-2;-2;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;22;9;22;30000;0;8;22;1;1;1;-1;0;0;0;0;1;1;0;2;0;0 @@ -941,7 +988,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;26;45;13;30000;0;17;26;1;12;45;0;1; Intel(R) Data Center GPU Max 1550 [0x0bd5];3;26;45;25;30000;0;10;8;1;1;45;1;-2;0;0;0;1;0;1;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;26;45;26;30000;0;5;26;1;1;45;-2;1;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;26;45;28;30000;0;1;26;1;19;45;-2;1;0;0;0;1;0;0;0;0;0 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;26;45;32;30000;0;17;8;1;5;45;-2;2;0;0;0;1;0;1;2;0;1 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;26;45;32;30000;0;17;16;1;24;45;-2;2;0;0;0;1;0;1;2;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;26;45;45;30000;0;5;26;1;1;45;1;1;0;1;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;28;4;4;30000;0;16;28;1;7;1;-2;4;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;28;4;5;30000;0;10;8;1;26;28;1;1;0;0;0;1;0;0;2;0;0 @@ -1063,7 +1110,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;29;32;32;30000;0;30;29;1;1;32;-2;5; Intel(R) Data Center GPU Max 1550 [0x0bd5];3;29;32;55;30000;0;5;16;1;1;32;1;3;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;29;55;16;30000;0;15;29;1;12;55;0;1;0;0;0;1;0;1;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;29;55;29;30000;0;27;8;1;1;55;-2;4;0;0;0;1;0;2;0;0;0 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;29;55;32;30000;0;14;8;1;23;55;0;3;0;0;0;1;0;1;1;0;1 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;29;55;32;30000;0;14;16;1;24;55;0;3;0;0;0;1;0;1;1;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;29;55;55;30000;0;3;16;1;1;55;0;2;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;30;30;30;30000;0;8;30;1;1;1;-2;1;0;0;1;1;1;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;30;30;76;30000;0;8;30;1;1;30;-1;1;0;0;0;1;0;0;0;0;0 @@ -1087,7 +1134,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;9;30000;0;10;8;1;18;32;-2;1;0; Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;13;30000;0;8;32;1;26;1;-2;-2;0;1;0;1;0;0;2;1;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;17;30000;0;8;32;1;1;1;-1;1;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;24;30000;0;8;8;1;1;32;0;-1;0;0;0;1;0;2;1;0;0 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;25;30000;0;8;8;1;1;32;-1;-1;0;0;0;1;0;0;1;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;25;30000;0;8;8;1;1;32;-1;0;0;0;0;1;0;0;1;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;26;30000;0;8;8;1;1;32;-2;0;0;0;0;1;0;0;1;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;28;30000;0;8;8;1;1;32;-2;4;0;0;0;1;0;2;1;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;32;30000;0;8;32;1;1;1;-2;-2;0;0;0;1;0;0;0;0;0 @@ -1137,6 +1184,11 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;17;5;30000;0;29;8;1;31;1;1;1;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;17;13;30000;0;27;8;1;23;1;0;-1;0;0;0;1;0;1;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;17;17;30000;0;15;32;1;1;1;-1;1;1;0;1;1;1;2;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;17;32;30000;0;7;8;1;1;1;-2;4;0;0;0;1;0;2;1;0;1 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;20;5;30000;0;40;8;1;29;32;-2;4;0;0;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;20;6;30000;0;31;32;1;4;32;-2;4;0;0;0;1;0;0;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;20;13;30000;0;27;32;1;2;32;-1;4;0;0;0;1;0;1;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;20;20;30000;0;12;32;1;1;32;1;4;0;0;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;20;32;30000;0;3;32;1;22;32;-2;4;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;22;9;30000;0;27;8;1;8;32;1;-1;0;0;0;1;0;1;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;22;22;30000;0;12;8;1;1;1;1;4;1;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;22;32;30000;0;9;8;1;1;1;-1;-1;0;0;0;1;0;1;1;0;1 @@ -1184,11 +1236,13 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;29;32;30000;0;59;8;1;1;32;0;-1;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;29;55;30000;0;59;8;1;1;32;-1;4;0;1;1;1;0;1;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;4;30000;0;41;8;1;1;1;-2;1;0;0;0;1;0;1;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;5;30000;0;30;8;1;13;1;1;1;0;0;0;1;0;1;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;6;30000;0;41;32;1;1;32;-2;4;0;0;0;1;0;1;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;7;30000;0;30;8;1;14;32;1;-2;0;0;0;1;0;1;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;9;30000;0;30;8;1;8;1;1;1;0;0;0;1;0;1;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;13;30000;0;30;8;1;17;1;1;1;0;0;0;1;0;1;2;0;0 -Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;14;30000;0;30;8;1;13;32;0;0;0;0;0;1;0;1;2;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;14;30000;0;30;8;1;22;32;0;-1;0;0;0;1;0;1;2;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;17;30000;0;11;32;1;1;1;-1;1;0;0;0;1;0;0;0;0;0 +Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;20;30000;0;30;32;1;1;32;-1;4;0;0;0;1;0;2;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;22;30000;0;8;32;1;1;1;1;1;0;0;0;1;0;2;0;0;1 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;24;30000;0;8;32;1;1;1;-2;1;0;0;0;1;0;0;0;0;0 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;25;30000;0;30;8;1;1;32;1;1;0;0;0;1;0;2;0;0;0 diff --git a/src/acc/opencl/smm/tune_multiply.py b/src/acc/opencl/smm/tune_multiply.py index 40d68c82f6f..7080b8ba29f 100755 --- a/src/acc/opencl/smm/tune_multiply.py +++ b/src/acc/opencl/smm/tune_multiply.py @@ -14,8 +14,7 @@ from opentuner import MeasurementInterface from opentuner import Result from signal import signal, SIGINT -import tempfile -import shutil +import tempfile # , shutil import copy import json import glob @@ -123,6 +122,10 @@ def __init__(self, args): device = re.search(devicepat, str(self.run_result["stderr"])) self.ndevices = int(device.group(1)) if device and device.group(1) else 0 self.device = device.group(2) if device and device.group(2) else "" + # idevice: make certain resources/names unique on a per-rank basis + envrank = os.getenv("PMI_RANK", os.getenv("OMPI_COMM_WORLD_LOCAL_RANK")) + if envrank: + self.idevice = int(envrank) % self.ndevices elif self.args.update is not None and "" != self.args.update: self.device = self.args.update if self.run_result and 0 == self.run_result["returncode"]: @@ -198,14 +201,15 @@ def __init__(self, args): and (self.size and 0 < self.size) ): # setup database (DB) if self.args.database is None: # adjust DB-location - envrank = os.getenv("PMI_RANK", os.getenv("OMPI_COMM_WORLD_LOCAL_RANK")) tmpdir = os.path.join(tempfile.gettempdir(), "opentuner") - if envrank: - self.idevice = int(envrank) % self.ndevices + if self.idevice is not None: tmpdir += str(self.idevice) - if os.path.isdir(tmpdir): - shutil.rmtree(tmpdir) - os.mkdir(tmpdir) + # if os.path.isdir(tmpdir): + # shutil.rmtree(tmpdir) + try: + os.mkdir(tmpdir) + except: # noqa: E722 + pass self.args.database = "sqlite:///" + os.path.join( tmpdir, "{}.db".format(os.getpid()) ) @@ -267,7 +271,7 @@ def launch(self, envs, check, nrep=None, verbose=None): if verbose is not None and 0 != int(verbose): msg = env_exe.replace("OPENCL_LIBSMM_SMM_", "") print("{}: {}".format("x".join(map(str, mnk)), msg)) - env_std = "OMP_PROC_BIND=TRUE OPENCL_LIBSMM_SMM_S=0 NEO_CACHE_PERSISTENT=0" + env_std = "OMP_PROC_BIND=TRUE OPENCL_LIBSMM_SMM_S=0 NEO_CACHE_PERSISTENT=0 CUDA_CACHE_DISABLE=1" env_check = "CHECK={}".format(check if check is not None else 1) env_intrn = "{} {}".format( # consider device-id "" if self.idevice is None else "ACC_OPENCL_DEVICE={}".format(self.idevice), @@ -587,18 +591,15 @@ def save_final_config(self, configuration, final=True): except: # noqa: E722 pass gflops = data["GFLOPS"] if data and "GFLOPS" in data else 0 - filename = os.path.join( - self.args.jsondir, - ( - "{}-{}gflops.json".format(self.args.label, round(gflops)) - if 0 < gflops - else "{}.json".format(self.args.label) - ), - ) - try: - os.rename(filedot, filename) - except: # noqa: E722 - pass + if 0 < gflops: + filename = os.path.join( + self.args.jsondir, + "{}-{}gflops.json".format(self.args.label, round(gflops)), + ) + try: + os.rename(filedot, filename) + except: # noqa: E722 + pass # self.manipulator().save_to_file(config, filename) with open(filedot, "w") as file: cfg = config @@ -614,7 +615,7 @@ def save_final_config(self, configuration, final=True): mnk = "x".join(map(str, self.mnk)) print("FAILED[{}] {}: {}".format(result, mnk, failed), flush=True) return - if final and os.path.exists(filedot): + if final and 0 < self.gflops and os.path.exists(filedot): filepattern = "{}-*.json".format(default_basename) fileglobs = glob.glob( os.path.normpath(os.path.join(self.args.jsondir, filepattern)) @@ -905,8 +906,6 @@ def handle_sigint(self, signum, frame): # OPENCL_LIBSMM_SMM_xx=tune|enabled|on must be given to permit tuning) if os.getenv("OPENCL_LIBSMM_SMM_WS") not in default_enable_tune: os.environ["OPENCL_LIBSMM_SMM_WS"] = "{}".format(args.ws) - if os.getenv("OPENCL_LIBSMM_SMM_AL") not in default_enable_tune: - os.environ["OPENCL_LIBSMM_SMM_AL"] = "{}".format(args.al) # fix tunables according to level of tuning if 1 <= args.tlevel or 0 > args.tlevel: os.environ["OPENCL_LIBSMM_SMM_BM"] = "{}".format(args.bm) @@ -932,7 +931,7 @@ def handle_sigint(self, signum, frame): line = file.readline() if not line: break - args.mnk = line.strip() + args.mnk, args.label = line.strip(), "" if args.mnk: start(args) print("") @@ -944,6 +943,4 @@ def handle_sigint(self, signum, frame): args.merge = -1 start(args) else: - if not args.mnk: # parse and sanitize kernel shape - args.mnk = default_mnk start(args) diff --git a/src/base/dbcsr_machine.F b/src/base/dbcsr_machine.F index 3428cf2ee19..8cebf095494 100644 --- a/src/base/dbcsr_machine.F +++ b/src/base/dbcsr_machine.F @@ -17,7 +17,7 @@ MODULE dbcsr_machine m_abort, m_chdir, m_flush_internal => m_flush, m_getarg, m_getcwd, m_getlog, m_getpid, & m_hostnm, m_iargc, m_memory, m_memory_details, m_memory_max, m_mov, m_procrun -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads, OMP_GET_WTIME +!$ USE OMP_LIB, ONLY: omp_get_wtime IMPLICIT NONE @@ -100,7 +100,7 @@ FUNCTION m_walltime() RESULT(wt) wt = (REAL(count, KIND=dp) + REAL(cycles, KIND=dp)*(1.0_dp + REAL(count_max, KIND=dp))) & /REAL(count_rate, KIND=dp) !$ ELSE -!$ wt = OMP_GET_WTIME() +!$ wt = omp_get_wtime() !$ END IF #endif END FUNCTION m_walltime diff --git a/src/base/dbcsr_machine_posix.f90 b/src/base/dbcsr_machine_posix.f90 index cf40fc13d88..0d66707e7a8 100644 --- a/src/base/dbcsr_machine_posix.f90 +++ b/src/base/dbcsr_machine_posix.f90 @@ -17,7 +17,7 @@ PRIVATE PUBLIC :: m_flush, m_memory, & - m_hostnm, m_getcwd, m_getlog, m_getuid, m_getpid, m_getarg, & + m_hostnm, m_getcwd, m_getlog, m_getpid, m_getarg, & m_iargc, m_abort, m_chdir, m_mov, & m_memory_details, m_procrun @@ -325,20 +325,6 @@ SUBROUTINE m_getlog(user) END SUBROUTINE m_getlog -! ***************************************************************************** - SUBROUTINE m_getuid(uid) - INTEGER, INTENT(OUT) :: uid - - INTERFACE - FUNCTION getuid() BIND(C, name="getuid") RESULT(uid) - IMPORT - INTEGER(KIND=C_INT) :: uid - END FUNCTION - END INTERFACE - - uid = getuid() - END SUBROUTINE m_getuid - ! ***************************************************************************** SUBROUTINE m_getpid(pid) INTEGER, INTENT(OUT) :: pid diff --git a/src/block/dbcsr_block_access.F b/src/block/dbcsr_block_access.F index 71cd2076b07..6fd7319deea 100644 --- a/src/block/dbcsr_block_access.F +++ b/src/block/dbcsr_block_access.F @@ -62,7 +62,7 @@ MODULE dbcsr_block_access real_8 #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads +!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads IMPLICIT NONE diff --git a/src/block/dbcsr_block_operations.F b/src/block/dbcsr_block_operations.F index fc7f8b51e4d..370327c20d5 100644 --- a/src/block/dbcsr_block_operations.F +++ b/src/block/dbcsr_block_operations.F @@ -30,7 +30,6 @@ MODULE dbcsr_block_operations sp #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads IMPLICIT NONE #if defined(__LIBXSMM) && TO_VERSION(1, 10) < TO_VERSION(LIBXSMM_CONFIG_VERSION_MAJOR, LIBXSMM_CONFIG_VERSION_MINOR) # define __LIBXSMM_BLOCKOPS diff --git a/src/block/dbcsr_iterator_operations.F b/src/block/dbcsr_iterator_operations.F index 64afa010657..49626f1a52d 100644 --- a/src/block/dbcsr_iterator_operations.F +++ b/src/block/dbcsr_iterator_operations.F @@ -31,7 +31,7 @@ MODULE dbcsr_iterator_operations real_8 #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads, omp_in_parallel +!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads, omp_in_parallel IMPLICIT NONE diff --git a/src/core/dbcsr_config.F b/src/core/dbcsr_config.F index 0655a1cc81a..c6d06708390 100644 --- a/src/core/dbcsr_config.F +++ b/src/core/dbcsr_config.F @@ -177,9 +177,7 @@ MODULE dbcsr_config SET_PARAMETER_DEFAULT(USE_MEMPOOLS_CPU, CONF_PAR_LOGICAL, .FALSE.) SET_PARAMETER_DEFAULT(USE_MPI_ALLOCATOR, CONF_PAR_LOGICAL, .FALSE.) SET_PARAMETER_DEFAULT(TAS_SPLIT_FACTOR, CONF_PAR_REAL, 1.0_real_8) -#if defined(__DBCSR_ACC_G2G) - SET_PARAMETER_DEFAULT(USE_ACC_G2G, CONF_PAR_LOGICAL, .TRUE.) -#endif + SET_PARAMETER_DEFAULT(USE_ACC_G2G, CONF_PAR_LOGICAL, .FALSE.) END TYPE dbcsr_config_type TYPE(dbcsr_config_type), PROTECTED, SAVE :: dbcsr_cfg = dbcsr_config_type() ! defaults @@ -414,11 +412,7 @@ SUBROUTINE dbcsr_set_config( & CALL dbcsr_cfg%accdrv_binning_binsize%set(accdrv_binning_binsize) CALL dbcsr_cfg%use_mempools_cpu%set(use_mempools_cpu) CALL dbcsr_cfg%tas_split_factor%set(tas_split_factor) -#if defined(__DBCSR_ACC_G2G) CALL dbcsr_cfg%use_acc_g2g%set(use_acc_g2g) -#else - MARK_USED(use_acc_g2g) -#endif IF (0 == nthreads) THEN nthreads = 1 @@ -517,11 +511,7 @@ SUBROUTINE dbcsr_get_default_config( & IF (PRESENT(use_mempools_cpu)) use_mempools_cpu = dbcsr_cfg%use_mempools_cpu%defval IF (PRESENT(nstacks)) nstacks = dbcsr_cfg%n_stacks%defval IF (PRESENT(tas_split_factor)) tas_split_factor = dbcsr_cfg%tas_split_factor%defval -#if defined(__DBCSR_ACC_G2G) IF (PRESENT(use_acc_g2g)) use_acc_g2g = dbcsr_cfg%use_acc_g2g%defval -#else - MARK_USED(use_acc_g2g) -#endif END SUBROUTINE dbcsr_get_default_config @@ -650,11 +640,9 @@ SUBROUTINE dbcsr_print_config(unit_nr) WRITE (UNIT=unit_nr, FMT='(1X,A,T70,I11,A4)') & "DBCSR| ACC: Min. flop for processing", dbcsr_cfg%accdrv_min_flop_process%val, & dbcsr_cfg%accdrv_min_flop_process%print_source() -#if defined(__DBCSR_ACC_G2G) WRITE (UNIT=unit_nr, FMT='(1X,A,T80,L1,A4)') & "DBCSR| ACC: Use G2G algorithm", dbcsr_cfg%use_acc_g2g%val, & dbcsr_cfg%use_acc_g2g%print_source() -#endif IF (dbcsr_cfg%accdrv_stack_sort%val) THEN WRITE (UNIT=unit_nr, FMT='(1X,A,T70,I11,A4)') & "DBCSR| ACC: Min. flop for sorting", dbcsr_cfg%accdrv_min_flop_sort%val, & diff --git a/src/core/dbcsr_lib.F b/src/core/dbcsr_lib.F index 401abe931da..3f0a44a7c41 100644 --- a/src/core/dbcsr_lib.F +++ b/src/core/dbcsr_lib.F @@ -56,8 +56,6 @@ MODULE dbcsr_lib #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads - #if defined (__DBCSR_ACC) USE ISO_C_BINDING, ONLY: C_INT #endif diff --git a/src/core/dbcsr_methods.F b/src/core/dbcsr_methods.F index 862076e3fb2..70e7b9c064e 100644 --- a/src/core/dbcsr_methods.F +++ b/src/core/dbcsr_methods.F @@ -27,7 +27,6 @@ MODULE dbcsr_methods dbcsr_type_real_8, dbcsr_type_symmetric, dbcsr_work_type #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads IMPLICIT NONE PRIVATE diff --git a/src/core/dbcsr_types.F b/src/core/dbcsr_types.F index 0c175889be8..fb232079d51 100644 --- a/src/core/dbcsr_types.F +++ b/src/core/dbcsr_types.F @@ -24,8 +24,6 @@ MODULE dbcsr_types int_8 USE dbcsr_mpiwrap, ONLY: mp_comm_type, mp_comm_null -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads - IMPLICIT NONE PRIVATE diff --git a/src/data/dbcsr_data_methods_low.F b/src/data/dbcsr_data_methods_low.F index 8e89bb267a8..041a6ef3edb 100644 --- a/src/data/dbcsr_data_methods_low.F +++ b/src/data/dbcsr_data_methods_low.F @@ -30,8 +30,6 @@ MODULE dbcsr_data_methods_low real_8 #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads - IMPLICIT NONE PRIVATE diff --git a/src/data/dbcsr_data_operations.F b/src/data/dbcsr_data_operations.F index 66e8f9c1bea..e5e9e1ba079 100644 --- a/src/data/dbcsr_data_operations.F +++ b/src/data/dbcsr_data_operations.F @@ -30,8 +30,6 @@ MODULE dbcsr_data_operations dbcsr_type_real_8 #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads - IMPLICIT NONE PRIVATE diff --git a/src/data/dbcsr_data_types.F b/src/data/dbcsr_data_types.F index cef186a0512..da18b6c4407 100644 --- a/src/data/dbcsr_data_types.F +++ b/src/data/dbcsr_data_types.F @@ -15,7 +15,7 @@ MODULE dbcsr_data_types USE dbcsr_kinds, ONLY: & dp, int_4, int_4_size, int_8, int_8_size, real_4, real_4_size, real_8, real_8_size -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads, omp_lock_kind +!$ USE OMP_LIB, ONLY: omp_lock_kind #include "base/dbcsr_base_uses.f90" diff --git a/src/data/dbcsr_mem_methods.F b/src/data/dbcsr_mem_methods.F index 66abf41f35a..a28a8fbef79 100644 --- a/src/data/dbcsr_mem_methods.F +++ b/src/data/dbcsr_mem_methods.F @@ -22,8 +22,7 @@ MODULE dbcsr_mem_methods dbcsr_memtype_type USE dbcsr_kinds, ONLY: dp -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads, & -!$ omp_set_lock, omp_unset_lock, omp_init_lock, omp_lock_kind, omp_destroy_lock +!$ USE OMP_LIB, ONLY: omp_set_lock, omp_unset_lock, omp_init_lock, omp_destroy_lock #include "base/dbcsr_base_uses.f90" diff --git a/src/data/dbcsr_ptr_util.F b/src/data/dbcsr_ptr_util.F index eb9d25da071..030eaa16cf5 100644 --- a/src/data/dbcsr_ptr_util.F +++ b/src/data/dbcsr_ptr_util.F @@ -28,8 +28,6 @@ MODULE dbcsr_ptr_util mp_deallocate #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads - IMPLICIT NONE PRIVATE @@ -294,15 +292,9 @@ SUBROUTINE mem_copy_${nametype1}$ (dst, src, n) !! length of copy ${type1}$, DIMENSION(1:n), INTENT(OUT) :: dst !! destination memory - ${type1}$, DIMENSION(1:n), INTENT(IN) :: src + ${type1}$, DIMENSION(1:n), INTENT(IN) :: src !! source memory -#if !defined(__DBCSR_DISABLE_WORKSHARE) -!$OMP PARALLEL WORKSHARE DEFAULT(none) SHARED(dst,src) -#endif dst(:) = src(:) -#if !defined(__DBCSR_DISABLE_WORKSHARE) -!$OMP END PARALLEL WORKSHARE -#endif END SUBROUTINE mem_copy_${nametype1}$ SUBROUTINE mem_zero_${nametype1}$ (dst, n) @@ -312,13 +304,7 @@ SUBROUTINE mem_zero_${nametype1}$ (dst, n) !! length of elements to zero ${type1}$, DIMENSION(1:n), INTENT(OUT) :: dst !! destination memory -#if !defined(__DBCSR_DISABLE_WORKSHARE) -!$OMP PARALLEL WORKSHARE DEFAULT(none) SHARED(dst) -#endif dst(:) = ${zero1}$ -#if !defined(__DBCSR_DISABLE_WORKSHARE) -!$OMP END PARALLEL WORKSHARE -#endif END SUBROUTINE mem_zero_${nametype1}$ SUBROUTINE mem_alloc_${nametype1}$ (mem, n, mem_type) diff --git a/src/dbcsr_api.F b/src/dbcsr_api.F index d28060357ea..a0a3faa4d10 100644 --- a/src/dbcsr_api.F +++ b/src/dbcsr_api.F @@ -140,7 +140,6 @@ MODULE dbcsr_api real_4, & real_8 -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads #include "base/dbcsr_base_uses.f90" IMPLICIT NONE diff --git a/src/dist/dbcsr_dist_methods.F b/src/dist/dbcsr_dist_methods.F index 8a44ac20f76..68b2ac0a38c 100644 --- a/src/dist/dbcsr_dist_methods.F +++ b/src/dist/dbcsr_dist_methods.F @@ -39,7 +39,7 @@ MODULE dbcsr_dist_methods dbcsr_mp_obj #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads, OMP_IN_PARALLEL +!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_num_threads, omp_in_parallel IMPLICIT NONE PRIVATE @@ -467,7 +467,7 @@ SUBROUTINE dbcsr_distribution_make_threads(dist, row_sizes) ! --------------------------------------------------------------------------- dist_p => dist -!$ IF (.NOT. OMP_IN_PARALLEL()) THEN +!$ IF (.NOT. omp_in_parallel()) THEN ! GCC 10.2 refused to build with DEFAULT(NONE) SHARED(dist_p, row_sizes) here: !$OMP PARALLEL DEFAULT(SHARED) !$ CALL make_threads(dist_p, row_sizes=row_sizes) diff --git a/src/dist/dbcsr_dist_operations.F b/src/dist/dbcsr_dist_operations.F index 1a6972b5075..2ebf3fb65c1 100644 --- a/src/dist/dbcsr_dist_operations.F +++ b/src/dist/dbcsr_dist_operations.F @@ -36,8 +36,6 @@ MODULE dbcsr_dist_operations dbcsr_type #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads - IMPLICIT NONE PRIVATE diff --git a/src/dist/dbcsr_dist_util.F b/src/dist/dbcsr_dist_util.F index 8070f2fd5a8..ea988a1fe99 100644 --- a/src/dist/dbcsr_dist_util.F +++ b/src/dist/dbcsr_dist_util.F @@ -47,7 +47,6 @@ MODULE dbcsr_dist_util dbcsr_type_complex_8, dbcsr_type_real_4, dbcsr_type_real_8 #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads IMPLICIT NONE PRIVATE diff --git a/src/mm/dbcsr_acc_operations.F b/src/mm/dbcsr_acc_operations.F index d9b9d10040f..776f7ada5f1 100644 --- a/src/mm/dbcsr_acc_operations.F +++ b/src/mm/dbcsr_acc_operations.F @@ -23,8 +23,6 @@ MODULE dbcsr_acc_operations USE dbcsr_kinds, ONLY: real_8, dp USE dbcsr_types, ONLY: dbcsr_type_real_8 -!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads - #include "base/dbcsr_base_uses.f90" IMPLICIT NONE diff --git a/src/mm/dbcsr_mm.F b/src/mm/dbcsr_mm.F index b3d475310e6..8dd9da1eaee 100644 --- a/src/mm/dbcsr_mm.F +++ b/src/mm/dbcsr_mm.F @@ -906,7 +906,6 @@ SUBROUTINE dbcsr_multiply_generic(transa, transb, & flop=my_flop, keep_product_data=keep_product_data) ELSE data_type = dbcsr_get_data_type(product_matrix) -#if defined (__DBCSR_ACC_G2G) IF (data_type .NE. dbcsr_type_real_8 .OR. (.NOT. dbcsr_cfg%use_acc_g2g%val)) THEN ! If G2G is enabled, norms have to be calculated on the GPU. ! Since the norms kernel expects only real_8 type data, we @@ -921,12 +920,6 @@ SUBROUTINE dbcsr_multiply_generic(transa, transb, & filter_eps=filter_eps, & flop=my_flop, keep_product_data=keep_product_data) END IF -#else - CALL multiply_cannon(m2s_left, m2s_right, product_matrix, & - retain_sparsity=retain_sparsity, & - filter_eps=filter_eps, & - flop=my_flop, keep_product_data=keep_product_data) -#endif CALL dbcsr_finalize(product_matrix, reshuffle=PRESENT(filter_eps) .AND. .NOT. keep_sparsity) END IF ! diff --git a/src/mm/dbcsr_mm_3d.F b/src/mm/dbcsr_mm_3d.F index b25b360abd7..60f46754518 100644 --- a/src/mm/dbcsr_mm_3d.F +++ b/src/mm/dbcsr_mm_3d.F @@ -107,8 +107,9 @@ MODULE dbcsr_mm_3d dbcsr_work_destroy #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads, & -!$ omp_set_lock, omp_unset_lock, omp_init_lock, omp_lock_kind, omp_destroy_lock +!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads, & +!$ omp_set_lock, omp_unset_lock, omp_init_lock, & +!$ omp_lock_kind, omp_destroy_lock IMPLICIT NONE diff --git a/src/mm/dbcsr_mm_accdrv.F b/src/mm/dbcsr_mm_accdrv.F index 5a4cc28f05b..8c69ad07dd0 100644 --- a/src/mm/dbcsr_mm_accdrv.F +++ b/src/mm/dbcsr_mm_accdrv.F @@ -531,6 +531,9 @@ SUBROUTINE dbcsr_mm_accdrv_process(this, left, right, params, stack_size, & IF (success) THEN CALL acc_event_record(stackbuf%calculated, stream=stackbuf%stream) ELSE + IF (dbcsr_cfg%use_acc_g2g%val) THEN + DBCSR_ABORT("MPI G2G requires all kernels to be evaluated on the GPU!") + END IF this%do_gpu_c_redux = .TRUE. END IF diff --git a/src/mm/dbcsr_mm_cannon.F b/src/mm/dbcsr_mm_cannon.F index 53b1f7faf7c..71357f53ded 100644 --- a/src/mm/dbcsr_mm_cannon.F +++ b/src/mm/dbcsr_mm_cannon.F @@ -123,7 +123,7 @@ MODULE dbcsr_mm_cannon #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads +!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads IMPLICIT NONE diff --git a/src/mm/dbcsr_mm_csr.F b/src/mm/dbcsr_mm_csr.F index 0cf05b0a7e3..f84692b747f 100644 --- a/src/mm/dbcsr_mm_csr.F +++ b/src/mm/dbcsr_mm_csr.F @@ -45,7 +45,7 @@ MODULE dbcsr_mm_csr dbcsr_work_type #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads +!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads IMPLICIT NONE diff --git a/src/mm/dbcsr_mm_dist_operations.F b/src/mm/dbcsr_mm_dist_operations.F index 04159a34a83..7095bd18cd1 100644 --- a/src/mm/dbcsr_mm_dist_operations.F +++ b/src/mm/dbcsr_mm_dist_operations.F @@ -35,8 +35,6 @@ MODULE dbcsr_mm_dist_operations dbcsr_slot_nblkcols_local, dbcsr_slot_nblkrows_local, dbcsr_type #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads - IMPLICIT NONE PRIVATE diff --git a/src/mm/dbcsr_mm_hostdrv.F b/src/mm/dbcsr_mm_hostdrv.F index fa33c90021a..99eee4e3641 100644 --- a/src/mm/dbcsr_mm_hostdrv.F +++ b/src/mm/dbcsr_mm_hostdrv.F @@ -38,8 +38,6 @@ MODULE dbcsr_mm_hostdrv sp #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads - IMPLICIT NONE PRIVATE diff --git a/src/mm/dbcsr_mm_multrec.F b/src/mm/dbcsr_mm_multrec.F index 161b0b69f6a..2d3152dab1a 100644 --- a/src/mm/dbcsr_mm_multrec.F +++ b/src/mm/dbcsr_mm_multrec.F @@ -43,7 +43,7 @@ MODULE dbcsr_mm_multrec sp #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads +!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads IMPLICIT NONE diff --git a/src/mm/dbcsr_mm_sched.F b/src/mm/dbcsr_mm_sched.F index db68526e398..19fd41de289 100644 --- a/src/mm/dbcsr_mm_sched.F +++ b/src/mm/dbcsr_mm_sched.F @@ -49,7 +49,7 @@ MODULE dbcsr_mm_sched dbcsr_work_type #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads +!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads IMPLICIT NONE diff --git a/src/mm/dbcsr_multiply_api.F b/src/mm/dbcsr_multiply_api.F index 44c0806bb99..ff0b95e2f5c 100644 --- a/src/mm/dbcsr_multiply_api.F +++ b/src/mm/dbcsr_multiply_api.F @@ -18,8 +18,6 @@ MODULE dbcsr_multiply_api dbcsr_type_real_4, & dbcsr_type_real_8 -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads - #include "base/dbcsr_base_uses.f90" IMPLICIT NONE diff --git a/src/mpi/dbcsr_mp_methods.F b/src/mpi/dbcsr_mp_methods.F index 413e9afbf4c..7ef89d040fc 100644 --- a/src/mpi/dbcsr_mp_methods.F +++ b/src/mpi/dbcsr_mp_methods.F @@ -19,8 +19,6 @@ MODULE dbcsr_mp_methods mp_comm_null, mp_comm_type USE dbcsr_types, ONLY: dbcsr_mp_obj -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads - #include "base/dbcsr_base_uses.f90" IMPLICIT NONE diff --git a/src/mpi/dbcsr_mpiwrap.F b/src/mpi/dbcsr_mpiwrap.F index f5393630eb9..a82edf17251 100644 --- a/src/mpi/dbcsr_mpiwrap.F +++ b/src/mpi/dbcsr_mpiwrap.F @@ -5182,13 +5182,7 @@ SUBROUTINE mp_rget_${nametype1}$v(base, source, win, win_data, myproc, disp, req MARK_USED(myproc) #endif IF (do_local_copy) THEN -#if !defined(__DBCSR_DISABLE_WORKSHARE) -!$OMP PARALLEL WORKSHARE DEFAULT(none) SHARED(base,win_data,disp_aint,len) -#endif base(:) = win_data(disp_aint + 1:disp_aint + len) -#if !defined(__DBCSR_DISABLE_WORKSHARE) -!$OMP END PARALLEL WORKSHARE -#endif request = mp_request_null ierr = 0 ELSE diff --git a/src/ops/dbcsr_io.F b/src/ops/dbcsr_io.F index c5920543788..c024bb930bd 100644 --- a/src/ops/dbcsr_io.F +++ b/src/ops/dbcsr_io.F @@ -48,8 +48,6 @@ MODULE dbcsr_io USE dbcsr_work_operations, ONLY: dbcsr_create #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads - IMPLICIT NONE PRIVATE diff --git a/src/ops/dbcsr_operations.F b/src/ops/dbcsr_operations.F index e0a59a92e48..44112b8cb23 100644 --- a/src/ops/dbcsr_operations.F +++ b/src/ops/dbcsr_operations.F @@ -94,7 +94,7 @@ MODULE dbcsr_operations mp_sum #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads +!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads IMPLICIT NONE @@ -316,7 +316,6 @@ SUBROUTINE dbcsr_zero(matrix_a) CALL timeset(routineN, handle) SELECT CASE (dbcsr_get_data_type(matrix_a)) -#if defined(__DBCSR_DISABLE_WORKSHARE) CASE (dbcsr_type_complex_4) matrix_a%data_area%d%c_sp = (0.0, 0.0) CASE (dbcsr_type_complex_8) @@ -325,24 +324,6 @@ SUBROUTINE dbcsr_zero(matrix_a) matrix_a%data_area%d%r_sp = 0.0 CASE (dbcsr_type_real_8) matrix_a%data_area%d%r_dp = 0.0_dp -#else - CASE (dbcsr_type_complex_4) -!$OMP PARALLEL WORKSHARE DEFAULT(NONE), SHARED(matrix_a) - matrix_a%data_area%d%c_sp = (0.0, 0.0) -!$OMP END PARALLEL WORKSHARE - CASE (dbcsr_type_complex_8) -!$OMP PARALLEL WORKSHARE DEFAULT(NONE), SHARED(matrix_a) - matrix_a%data_area%d%c_dp = (0.0_dp, 0.0_dp) -!$OMP END PARALLEL WORKSHARE - CASE (dbcsr_type_real_4) -!$OMP PARALLEL WORKSHARE DEFAULT(NONE), SHARED(matrix_a) - matrix_a%data_area%d%r_sp = 0.0 -!$OMP END PARALLEL WORKSHARE - CASE (dbcsr_type_real_8) -!$OMP PARALLEL WORKSHARE DEFAULT(NONE), SHARED(matrix_a) - matrix_a%data_area%d%r_dp = 0.0_dp -!$OMP END PARALLEL WORKSHARE -#endif END SELECT CALL timestop(handle) END SUBROUTINE dbcsr_zero diff --git a/src/ops/dbcsr_test_methods.F b/src/ops/dbcsr_test_methods.F index bc081615913..ef7e5cd3528 100644 --- a/src/ops/dbcsr_test_methods.F +++ b/src/ops/dbcsr_test_methods.F @@ -60,8 +60,6 @@ MODULE dbcsr_test_methods dbcsr_work_create #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads - IMPLICIT NONE PRIVATE diff --git a/src/ops/dbcsr_tests.F b/src/ops/dbcsr_tests.F index dc498862459..c3f69ff8012 100644 --- a/src/ops/dbcsr_tests.F +++ b/src/ops/dbcsr_tests.F @@ -55,8 +55,6 @@ MODULE dbcsr_tests USE dbcsr_work_operations, ONLY: dbcsr_create #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads - IMPLICIT NONE PRIVATE diff --git a/src/ops/dbcsr_transformations.F b/src/ops/dbcsr_transformations.F index ff4e060cf00..e6fc2a17806 100644 --- a/src/ops/dbcsr_transformations.F +++ b/src/ops/dbcsr_transformations.F @@ -91,8 +91,6 @@ MODULE dbcsr_transformations dbcsr_work_create #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads - IMPLICIT NONE PRIVATE diff --git a/src/tensors/dbcsr_array_list_methods.F b/src/tensors/dbcsr_array_list_methods.F index a80277abf77..1077e172488 100644 --- a/src/tensors/dbcsr_array_list_methods.F +++ b/src/tensors/dbcsr_array_list_methods.F @@ -19,6 +19,13 @@ MODULE dbcsr_array_list_methods USE dbcsr_allocate_wrap, ONLY: allocate_any #include "base/dbcsr_base_uses.f90" +#if TO_VERSION(1, 11) <= TO_VERSION(LIBXSMM_CONFIG_VERSION_MAJOR, LIBXSMM_CONFIG_VERSION_MINOR) + USE libxsmm, ONLY: libxsmm_diff +# define PURE_ARRAY_EQ +#else +# define PURE_ARRAY_EQ PURE +#endif + IMPLICIT NONE PRIVATE CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_array_list_methods' @@ -275,15 +282,18 @@ FUNCTION check_equal(list1, list2) check_equal = array_eq_i(list1%col_data, list2%col_data) .AND. array_eq_i(list1%ptr, list2%ptr) END FUNCTION - PURE FUNCTION array_eq_i(arr1, arr2) + PURE_ARRAY_EQ FUNCTION array_eq_i(arr1, arr2) !! check whether two arrays are equal INTEGER, INTENT(IN), DIMENSION(:) :: arr1 INTEGER, INTENT(IN), DIMENSION(:) :: arr2 LOGICAL :: array_eq_i +#if TO_VERSION(1, 11) <= TO_VERSION(LIBXSMM_CONFIG_VERSION_MAJOR, LIBXSMM_CONFIG_VERSION_MINOR) + array_eq_i = .NOT. libxsmm_diff(arr1, arr2) +#else array_eq_i = .FALSE. IF (SIZE(arr1) .EQ. SIZE(arr2)) array_eq_i = ALL(arr1 == arr2) - +#endif END FUNCTION END MODULE dbcsr_array_list_methods diff --git a/src/tensors/dbcsr_tensor_types.F b/src/tensors/dbcsr_tensor_types.F index c8eb1953355..17147fb49fa 100644 --- a/src/tensors/dbcsr_tensor_types.F +++ b/src/tensors/dbcsr_tensor_types.F @@ -698,18 +698,6 @@ SUBROUTINE dbcsr_t_distribution_new_expert(dist, pgrid, map1_2d, map2_2d, ${varl ALLOCATE (dist%refcount) dist%refcount = 1 CALL timestop(handle) - - CONTAINS - PURE FUNCTION array_eq_i(arr1, arr2) - INTEGER, INTENT(IN), DIMENSION(:) :: arr1 - INTEGER, INTENT(IN), DIMENSION(:) :: arr2 - LOGICAL :: array_eq_i - - array_eq_i = .FALSE. - IF (SIZE(arr1) .EQ. SIZE(arr2)) array_eq_i = ALL(arr1 == arr2) - - END FUNCTION - END SUBROUTINE SUBROUTINE dbcsr_t_distribution_destroy(dist) diff --git a/src/utils/dbcsr_toollib.F b/src/utils/dbcsr_toollib.F index 5e6da8cf939..e084f683819 100644 --- a/src/utils/dbcsr_toollib.F +++ b/src/utils/dbcsr_toollib.F @@ -19,8 +19,6 @@ MODULE dbcsr_toollib real_8 #include "base/dbcsr_base_uses.f90" -!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads - IMPLICIT NONE PRIVATE diff --git a/src/work/dbcsr_work_operations.F b/src/work/dbcsr_work_operations.F index e55d0efae2e..faff0aee208 100644 --- a/src/work/dbcsr_work_operations.F +++ b/src/work/dbcsr_work_operations.F @@ -916,7 +916,7 @@ SUBROUTINE dbcsr_finalize(matrix, reshuffle) ! built/modified in a parallel environment nwms = SIZE(matrix%wms) spawn = .FALSE. -!$ IF (.NOT. OMP_IN_PARALLEL()) THEN +!$ IF (.NOT. omp_in_parallel()) THEN !$ IF (nwms .GT. 1) spawn = .TRUE. !$ END IF IF (spawn) THEN diff --git a/tools/docker/Dockerfile.build-env-ubuntu b/tools/docker/Dockerfile.build-env-ubuntu index 73c935dec18..0ace1bd728a 100644 --- a/tools/docker/Dockerfile.build-env-ubuntu +++ b/tools/docker/Dockerfile.build-env-ubuntu @@ -55,12 +55,12 @@ RUN set -ex ; \ git-archive-all \ ; -ARG libxsmm_version=1.17 +ARG libxsmm_version=488aa88f2a9825e9f92a0cfc773c1aedf019f88a RUN set -ex ; \ - curl -LsS https://github.com/hfp/libxsmm/archive/${libxsmm_version}.tar.gz | tar -xz -C /opt ; \ + curl -LsS https://github.com/libxsmm/libxsmm/archive/${libxsmm_version}.tar.gz | tar -xz -C /opt ; \ ln -s libxsmm-${libxsmm_version} /opt/libxsmm ; \ - make -j -C /opt/libxsmm MALLOC=0 + make -j -C /opt/libxsmm WRAP=0 ENV PKG_CONFIG_PATH="/opt/libxsmm/lib:${PKG_CONFIG_PATH}" diff --git a/tools/docker/Dockerfile.build-env-ubuntu-cuda b/tools/docker/Dockerfile.build-env-ubuntu-cuda index bdcc7bc109d..5dadec16251 100644 --- a/tools/docker/Dockerfile.build-env-ubuntu-cuda +++ b/tools/docker/Dockerfile.build-env-ubuntu-cuda @@ -46,12 +46,12 @@ RUN set -ex ; \ git-archive-all \ ; -ARG libxsmm_version=1.17 +ARG libxsmm_version=488aa88f2a9825e9f92a0cfc773c1aedf019f88a RUN set -ex ; \ - curl -LsS https://github.com/hfp/libxsmm/archive/${libxsmm_version}.tar.gz | tar -xz -C /opt ; \ + curl -LsS https://github.com/libxsmm/libxsmm/archive/${libxsmm_version}.tar.gz | tar -xz -C /opt ; \ ln -s libxsmm-${libxsmm_version} /opt/libxsmm ; \ - make -j -C /opt/libxsmm MALLOC=0 + make -j -C /opt/libxsmm WRAP=0 ENV PKG_CONFIG_PATH="/opt/libxsmm/lib:${PKG_CONFIG_PATH}"