diff --git a/.ci/daint.cscs.ch/ocl.build.sh b/.ci/daint.cscs.ch/ocl.build.sh
index f1f0fa4c105..01040f5932e 100755
--- a/.ci/daint.cscs.ch/ocl.build.sh
+++ b/.ci/daint.cscs.ch/ocl.build.sh
@@ -35,7 +35,7 @@ if [ ! -d "${HOME}/libxsmm" ]; then
 fi
 cd "${HOME}/libxsmm"
 git fetch
-git checkout d009b33e8742a93c9e1549323587fb6197451294
+git checkout 488aa88f2a9825e9f92a0cfc773c1aedf019f88a
 make -j
 cd ..
 
diff --git a/.github/workflows/testing-linux.yml b/.github/workflows/testing-linux.yml
index d9ef4fa8fac..ab5f5d1b5e2 100644
--- a/.github/workflows/testing-linux.yml
+++ b/.github/workflows/testing-linux.yml
@@ -74,16 +74,16 @@ jobs:
         mv build/coverage.info build/coverage-Linux-${{ matrix.use_mpi }}-${{ matrix.use_openmp }}-${{ matrix.use_smm }}-cpu.info
 
     - name: Upload coverage data
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
       with:
-        name: coverage-data
+        name: coverage-data-${{ matrix.use_mpi }}-${{ matrix.use_openmp }}-${{ matrix.use_smm }}-${{ matrix.mpi_suffix }}
         path: build/coverage-*.info
 
     - name: Upload coverage data (generated files)
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
       if: matrix.use_mpi == 'MPI=ON' && matrix.use_openmp == 'OPENMP=ON' && matrix.use_smm == 'SMM=blas' && matrix.mpi_suffix == 'openmpi'
       with:
-        name: coverage-data
+        name: coverage-data-${{ matrix.use_mpi }}-${{ matrix.use_openmp }}-${{ matrix.use_smm }}-${{ matrix.mpi_suffix }}-generated-files
         path: |
           build/src/dbcsr.h
           build/src/tensors/dbcsr_tensor.h
@@ -200,9 +200,10 @@ jobs:
       - uses: actions/checkout@v4
 
       - name: Download coverage data
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4.1.7
         with:
-          name: coverage-data
+          pattern: coverage-data-*
+          merge-multiple: true
 
       - name: Combine coverage
         run: |
@@ -213,7 +214,7 @@ jobs:
           lcov --summary merged.info
 
       - name: Upload merged HTML report
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: html-report
           path: htmlcov
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c0c79251adc..6b71c63041a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ fail_fast: false
 minimum_pre_commit_version: 3.2.0
 repos:
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: 'v0.5.4'
+  rev: 'v0.8.2'
   hooks:
   - id: ruff
     args: [ --fix, --exit-non-zero-on-fix ]
@@ -15,19 +15,19 @@ repos:
         .cp2k/.*|
       )$
 - repo: https://github.com/psf/black
-  rev: 24.4.2
+  rev: 24.10.0
   hooks:
   - id: black
     name: Reformat Python files with the black code formatter
     files: '^.*(/PACKAGE)|(\.py)$'
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.6.0
+  rev: v5.0.0
   hooks:
   - id: check-ast
   - id: check-yaml
   - id: check-symlinks
   - id: trailing-whitespace
-- repo: https://github.com/pseewald/fprettify
+- repo: https://github.com/fortran-lang/fprettify
   rev: v0.3.7
   hooks:
   - id: fprettify
@@ -65,4 +65,5 @@ repos:
     language: python
     files: \.(c|cc|cxx|cpp|cl|frag|glsl|h|hpp|hxx|ih|ispc|ipp|java|js|m|mm|proto|textproto|vert)$
     args: ['-i', '-fallback-style=none', '--style=file']
-    additional_dependencies: ['clang-format']
+    # constrain the version: clang-format output is not stable from one release to the next
+    additional_dependencies: ['clang-format~=19.1.0']
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6b3f9f569d0..e819bbc23a0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -120,13 +120,7 @@ set_property(CACHE WITH_GPU PROPERTY STRINGS ${SUPPORTED_CUDA_ARCHITECTURES}
 
 option(WITH_CUDA_PROFILING "Enable profiling within CUDA" OFF)
 option(WITH_HIP_PROFILING "Enable profiling within HIP" OFF)
-option(WITH_G2G "Enable GPU aware MPI within CUDA/HIP backends" OFF)
 
-if (WITH_G2G AND ((NOT USE_ACCEL) OR ((NOT USE_ACCEL MATCHES "cuda")
-                                      AND (NOT USE_ACCEL MATCHES "hip"))))
-  message(
-    FATAL_ERROR "GPU aware MPI can only be enabled for HIP/CUDA GPU backends")
-endif ()
 # =================================================================================================
 # LANGUAGES AND TESTING
 enable_language(Fortran)
@@ -274,7 +268,6 @@ if (USE_ACCEL MATCHES "cuda")
   message(STATUS "Kernel parameters: " ${WITH_GPU_PARAMS})
   message(STATUS "GPU architecture number: " ${ACC_ARCH_NUMBER})
   message(STATUS "GPU profiling enabled: " ${WITH_CUDA_PROFILING})
-  message(STATUS "GPU aware MPI enabled: " ${WITH_G2G})
 endif ()
 
 if (USE_ACCEL MATCHES "hip")
@@ -319,7 +312,6 @@ if (USE_ACCEL MATCHES "hip")
   message(STATUS "Kernel parameters: " ${WITH_GPU_PARAMS})
   message(STATUS "GPU architecture number: " ${ACC_ARCH_NUMBER})
   message(STATUS "GPU profiling enabled: " ${WITH_HIP_PROFILING})
-  message(STATUS "GPU aware MPI enabled: " ${WITH_G2G})
 
   # =================================== BLAS on GPU backend
   find_package(hipblas CONFIG REQUIRED HINTS ${ROCM_PATH})
diff --git a/VERSION b/VERSION
index 41222dc7221..71bea4b7ee9 100644
--- a/VERSION
+++ b/VERSION
@@ -1,8 +1,8 @@
 MAJOR = 2
-MINOR = 7
+MINOR = 8
 PATCH = 0
 # A specific DATE (YYYY-MM-DD) fixes an official release, otherwise
 # it is considered Development version.
-DATE  = 2024-07-29
+DATE  = 2024-12-11
 
 
diff --git a/cmake/CompilerConfiguration.cmake b/cmake/CompilerConfiguration.cmake
index b2b68dc8d8e..5d418744386 100644
--- a/cmake/CompilerConfiguration.cmake
+++ b/cmake/CompilerConfiguration.cmake
@@ -51,9 +51,6 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
   if ((NOT (USE_MPI)) OR (NOT ("${MPI_Fortran_LIBRARY_VERSION_STRING}" MATCHES "Open MPI")))
     set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=leak")
   endif ()
-  if (USE_ACCEL MATCHES "hip" AND  hip_VERSION GREATER_EQUAL 6.0.0) # Remove deprecated function error with ROCm v6+
-     set(CMAKE_CXX_FLAGS            "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations")
-  endif	()
 elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
   set(CMAKE_CXX_FLAGS_RELEASE      "-O3 -funroll-loops")
   set(CMAKE_CXX_FLAGS_COVERAGE     "-O0 -g --coverage")
diff --git a/docs/guide/2-user-guide/1-installation/index.md b/docs/guide/2-user-guide/1-installation/index.md
index 1c091558026..a3f1d362052 100644
--- a/docs/guide/2-user-guide/1-installation/index.md
+++ b/docs/guide/2-user-guide/1-installation/index.md
@@ -70,7 +70,6 @@ make
 -DUSE_ACCEL=<opencl|cuda|hip>
 -DWITH_CUDA_PROFILING=<OFF|ON>
 -DWITH_HIP_PROFILING=<OFF|ON>
--DWITH_G2G=<OFF|ON>
 -DWITH_C_API=<ON|OFF>
 -DWITH_EXAMPLES=<ON|OFF>
 -DWITH_GPU=<P100|K20X|K40|K80|V100|Mi50|Mi100|Mi250>
diff --git a/docs/guide/3-developer-guide/3-programming/1-overview/index.md b/docs/guide/3-developer-guide/3-programming/1-overview/index.md
index 27f6bda40d0..d55b9b3f30f 100644
--- a/docs/guide/3-developer-guide/3-programming/1-overview/index.md
+++ b/docs/guide/3-developer-guide/3-programming/1-overview/index.md
@@ -55,4 +55,3 @@ Assumed square matrix with 20x20 matrix with 5x5 blocks and a 2x2 processor grid
 | `__CUDA_PROFILING`  | To turn on Nvidia Tools Extensions. It requires to link `-lnvToolsExt` | Fortran, C, C++ |
 | `__CUDA` | Enable CUDA acceleration | C, C++ |
 | `__HIP`  | Enable HIP acceleration | C, C++ |
-| `__DBCSR_ACC_G2G` | Enable GPU Aware MPI in CUDA and HIP backends | Fortran, C, C++ |
diff --git a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/2-parameters.md b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/2-parameters.md
index 44729500ba9..9e54d016be7 100644
--- a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/2-parameters.md
+++ b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/2-parameters.md
@@ -14,9 +14,3 @@ The batched matrix-matrix multiplication kernels are templated on:
 The batched transpose kernels are templated on:
 
 * the characteristic dimensions of the transpose: `m, n`
-
-## Predictive parameters
-
-The input features for the predictive models can be 'raw' parameters (left-most-column in the figure below), or hand-engineered features 'derived' from the raw features (matrix sizes, launch parameters and resource usage estimations).
-
-![libsmm_acc_predictive_modeling_features](../../../../../media/images/libsmm_acc_predictive_modeling_features.png)
diff --git a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/4-predict.md b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/4-predict.md
deleted file mode 100644
index ce0ff183dd0..00000000000
--- a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/4-predict.md
+++ /dev/null
@@ -1,3 +0,0 @@
-title: Predictive Modeling Framework
-
-{!./src/acc/libsmm_acc/predict/README.md!}
diff --git a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/5-notebooks.md b/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/5-notebooks.md
deleted file mode 100644
index f450b61b1ad..00000000000
--- a/docs/guide/3-developer-guide/3-programming/2-accelerator-backend/2-libsmm_acc/5-notebooks.md
+++ /dev/null
@@ -1,3 +0,0 @@
-title: Notebooks
-
-{!./src/acc/libsmm_acc/notebooks/README.md!}
diff --git a/docs/media/images/README.md b/docs/media/images/README.md
deleted file mode 100644
index cf42b988ac8..00000000000
--- a/docs/media/images/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-#### libsmm_acc predictive_modelling_features
-
-The XML file `libsmm_acc_predictive_modelling_features.xml` can be opened in [www.draw.io](www.draw.io) to be edited.
diff --git a/docs/media/images/libsmm_acc_predictive_modeling_features.png b/docs/media/images/libsmm_acc_predictive_modeling_features.png
deleted file mode 100644
index 69df06ea0eb..00000000000
Binary files a/docs/media/images/libsmm_acc_predictive_modeling_features.png and /dev/null differ
diff --git a/docs/media/images/libsmm_acc_predictive_modeling_features.xml b/docs/media/images/libsmm_acc_predictive_modeling_features.xml
deleted file mode 100644
index 1a3b13b6908..00000000000
--- a/docs/media/images/libsmm_acc_predictive_modeling_features.xml
+++ /dev/null
@@ -1 +0,0 @@
-<mxfile modified="2019-01-14T14:46:28.021Z" host="www.draw.io" agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Iridium/2017.11 Safari/537.36 Chrome/62.0.3202.94" etag="OS6S2zdFp7nt5fg3MjB3" version="10.0.36" type="device"><diagram id="759781cc-e7b8-cac5-3e35-2364a77c4e3a" name="Page-1">7Z1tc5s6FoB/TWZ270w7IN4/Nmncnbnt3c62u529XzzYEIcNBi+GJumvvxIgDOiA7QRhBx/nQxIZY0DPOdJ50dGVdrN++pS4m/svseeHV0Txnq60j1eEEMew6C/W8ly0qCZxipZVEnhl267hW/DLLxuVsjULPH/bODCN4zANNs3GZRxF/jJttLlJEj82D7uLw+a3btyVLzR8W7qh2Poj8NL7otUm1q79H36wuuffrJrl/S3c5cMqibOo/L4ros3yV/H22uXnKm90e+968WOtSbu90m6SOE6Lv9ZPN37IHi5/bMXnZh3vVted+FF6yAe+3mbZD33z9O1/f/56sG82vyJdf0eIUZznpxtm5RP54qZJ8MSumHbW9oqY9M5ob2nX5X2kz/zZbR+DdehG9L/ruyAMb+IwTvJ32JPQbzTavkpcL6BXWHvvLn+xz8RROnPXQcjYuYmzJPAT+gV/+I/lmyUtRCn/r51EyV+0PXQXfvjV9bwgWtG3WNM2cjff469xkD8YlbWUFzprXmV1Jds0iR986Prjn35yF+b9xm6xegr15152BT0y9Z9qTWU/fPLjtZ8mz/SQ8l1dJcVHSqFxSkQedwASxS7a7uvw8QPdEvpVdepdx9M/yr6HOfhP9ueN+vn3L7b9y7ve+N+jn873d0TVBA7Wv0W/PdDuD+m3Xy9oz5irFKKA3jRrv0/XYfm4E5+i4y7yA1iHbFhX5JdsXF8ZH2mLm6Xxtuhd9gE3DFYR/XtJnyllQLtmzzKgQvqhfGMdeB473fV24y5pT3+PN+W590NUXqkCQHUfJ8Ev2ubyK89puq4kmxMRxVHty/nJBJjIcgnB5JkL0zDZc2Hn9D0OJVcI7GSr0N1uy7+X8TpYlgeJtO2R44MhVLkGLiG0jfeGgKGtiBTqA0BoC7QxGuZLxG2yuDVpUzVTgM2RBJsFw7ZA2C4ENlC1yaLNhGlzkbYLoU0bj7WOOxAR7Ju613Bb3geh99l9jjN2kduUdh//r0DR/8qfn9pq+uI+lQ9dQIGeJ+GwNA/4xr4BhuD2w2x26wAWBCfoqCl9+aEB5u6q3tItFqBbiAF0uKpZ8rpcHOO4vmGSmtvP/BmZ/8/iQv1QK8o0qRVVa1Lu3KXfOKypCeqH7nDijey73hXa6AM9gD7qp/onCrX34CcR8yIUl0fvt7jCDp0Ig1pHStQ/fTiBXBzHWHWug1WKeTBl+yAy7AEYqqjf0cK09IZp5PMbk9pegM4xqmdQaA9fwshik4VmmiI3nuHbng4gQt9zLE+xrNaQc8y4dzA/1sH88HeJ9d7OaTF1QzVVPgKUSku3FUBrVRbh0MMUd4HVeEvvE9/1tnNxVrTxk/kijJdvw/WAcL4ATtVu0sj1UZ1FAim/AVhURdfDT+RqElxVPlM+EbdEv6omScWpjoDVI2I1TaxUaMovDSxxZp8GoT+PcHicKG+q3uSNAGpMlkNBFV0HOW1rZGuabJER2RJDzW64ipGsaZBltciCLExTEloiWQ+I1TSwcppYqcBgqEuiSkyJiM6Mqj2gHJVcc9FUgdpKFldE4Oocp1dnEQqs+Oomso88mWHCV0NnjcocAF0Q5X7W7RnCh0Pl692qhgY6+SHHqsRYtOhubQUm5QccP339N3vkSbxhDOYprVJDj2CIsSOWOCPsR2
Swaj+Yl8NjjJwPW5xKwTHHIeLWGpBg6rIsY9GPlfgrerYiQjRfhBc7lTfzl8jGncF+YP20yH/G0E/2wbw9cYze29ruxYEtaTQJpKyg/NMhokCaLtCYU6e4IR0S34NYZlHA9FUOERI5BSLtXiJVootAgvpxCCBFF8ajm2z6iaRPO8pCNwnSZyRyEkRa/UTagIqEfGqDEAnk6K/9NepIJLLm5dV1aNiWxqQYnqLDNp3Qz7bn6EJB3o7nTdV6gdMUEDhFlgsYcJvkVgsbnJG7CXHXr+c040AtNwh0Ynpul6lcGcmFHw9xnAqOpIWj0UqZVMBlB1AC7yDjrpjdVswFEbipALdn3NVNe8xxl5+jARz90GqTIW/T4E3v5c1QId5kpSZpIm7lAgX6ycIPg8mWk8Rwj8NFG2/W13FH4sg7YvwMXjXoOIqie2KcjdGcZlG+ikxuhO249Xqzm1t9JiYRlB9qB974wQczd3TgTa28yPsib1V65qtC/yJCEb2tNx32b3Wab3qK50PqynFM5jUF1ZJnWJrjifrwZarHORgD/q7RTkISoJBVfgUqUsCWYs/fuuN2Clw49nvV3L0M0qDEBIJS0pKG4Mop8d2cldZCTE6MSVN7qIB5Jg0MuH4TBcOLM9bfyMZJ2TBbbJhisaUqkXqceSxg0FMC6KOnk1eiZFtWpZEof0uD6PnvLyw5aN5+nM3g7thVBdlTNaTHMqonidbtGLHaoFgzcBMn9HIiipBbHFbOPZvt/4pTNw3iiFMbPH10t/flf7tyk//M0jCIOg20m5tbI38OR2WAC9IlwvmCMod6A0PLFGc4mm6KGBqc31ctNwIyzpJszgibr92nOb01Nwz9cP4YJ286xayrMuZrrPyZeqsVELWEjLe/1JJ/Sap1pTsO13/cd1RVGBDBg/JgtQH0nwokTlTcBdG8ciwhcRMmzjah+Aw0HxuEOb1b1y2yu7dv1iFwwvS/BZxKQOKgiOAwxAGpOCVxLDQ4r5XxQe6mzB2vh16nDvJbDUEdfBMWYFuUKEaL8GGbs8jCh1OK4SCae9EkqgqpRCiAIxFOMZmHg0ifUVSteeHA5ollLWJZnkXtWMR22tga4EgO+WskYku6derbzvlBAoGVow0ANQXM9YEClBIB7LZoojeeBYQEAtVj9JYW7IBw5MEbWCZYQhgvlxkdk5dveuUVkrh3NNYMMXZCZPkO//vu+ofyPb4zt8/Pm8/Jz89Z9u1dlRhQo/D3OL0vOPyDGdu0hxLaXUjiOCSeOOQjCoKA9+Gy0S0IDTHQFUAMZE1JO8RAnBG0xGAVxgsUAhSC4YSgFcHU9VHd+h1yIE5KmnKAIoAiIE8EwMgWWKhDogiI1RJ2IrChXTD/HYUAhUCaEBjjBts6LhlaDC+mVG3XbhhKzak6LQBNoWhkZnWlb8nKmdqXXSbCeHROlWVZDRLBnCpFFzlUHUcSiJoixt9ybZyThwYq6uSDFPDhy2h4PbGmn0aDNPKogWhNEWN9bUFAExXFYFgxaO0+aYnrQ8aNeGtKh7tyJwUoACgA0gRAVQEJgOoUSJSADk9lIQFon6IMyJYBA5ABWUGrjnsQ1w5C1una94JsLdM8FXt7IPPvuLVEEmw/hyfpNGw/qJeNAdbTwN1si6oujF1vnkVJTHXdnbtM42SuorZDbbdXU7zU8jv1jJfYYkQGEAKCQjCSEIxF3sltLWKLgZBovn5g9G0x333yAEK2DrjBh0QCxShENH+IkMCpEqjuPK1nQ2EV8Whb3IV5gTEInIkOLAyVoW2JUwCoSqFE9DtyIeroY9QBwZc6D3HAcsSjLlnSFHEm0hYElAGUAVkyQLSDS3JLlAHR61qXAQw9oBRIlgILlAJt3NgDUIEHiD2EbrLyP17T27shmCA3kQQ5VXH2Vx3TefHEOo8mn0C8qi6KA1Z/ylGbf3VR9Z7zYsHji9g3w67gDEDauukqoxNEDe29iaFmGSdkjQA5vjvWlsjapFjT9FPqNQKk0V
asNUOqOJ5OC7xdwSZeRhHawh7OcRmEPHIoeTi8Tpw8lUALCiSiB9a7gdDD0Xbq6JkjowdWuSnQC+N4M1+76ToLEbtpY0dHv9OXS3R67AxWtX0ZR8ssYU8D67ZfApM6xKQ2bik6p8ceYduP59U6i7LuyOO0eCRNGm1jzIG5g8ae0Rorb08ZxrZTBhyuR00+Ik6P1YIbD0wSQ2K0B2mNbwd2utVmTo/jJpp/vJ4HDBMEcUogtvUhgQrGSss8gAtTKWL0N8+/KUjENOTxWTy7HJyXlaeyjxEFXQWTcMat0aaI0em2KGBaMgqCXEEwQEGQ5dLqKFbYUR5rJwgoAygD8mQAzs2X5bTokIGOyliFDGBaMkqBZCkw4Oz8ca1UAwhudFXsvCL0S5RXlUepknYxORmQ51MkJzvt0hVcC9eYNPjEubEx9yAJo4a4VDDJiqpUFDRMM+jXx9btB/NW6ROzEd0llSo53G/XKhHFN3hobMYtK1XZBIMWFXmYZXAp1PFtvcYIWagmuBajog53gb8o9ih8osaT5QpQgX0/6+ydR3BMBmHHQ9SanZbE7OGpD+TGNLJp+rRl7DQkmiM6pVR4K8+KRMwZuBwVqEEqcNy1uTZgA9doTNdsWEYKJ03hGSwRt/uH5zRCDKeOoQ3mT42MYf/YjNmlF8SjBswKpXlkOu4B0IqAg5pts1b4p8d1VR9VcLvTf9ywDk7r/ZblyhaZPdpVrbdynzVFBdSlDvluqiIyrzJbCFBAvth7u8NhjYuBz1o9Hk3gIbNGTZrvUAPtlB78MJNp0vhxg2EM+OBbMI8l0p2ndFAKsQrzZZGqEmCcHncVpwkuC+lTnojqRaJqQgvqxoYVLKPVA+sSYb1EWIkCzUBHhtU4aFutmhHeZ6wXfw9QBhMzzY7LNEvdhB9h6FejpJ7ZTmvzMDHzTHPAfaOHsKYIOCFgyRfrOX0ay4emikVFes6K9AVbhjTQM8XloiNXdDDExKAIl4hOHjwVHMHHteINcbb5iY6Bmy2iN2n0dP3k6FlA+vchyw9q00ScH05kJYKuNqukG6Y4H9T5lLExH+RbDkrgU9zApRYML5aKLTEt47w1ZaVkXl70Szv1VkLEAmOPTRITJHFyJLZ2jhBV4tgcgq7JJodrylOeHYQzyEnTqOri0pmR04Os3vU0BY7fkcJJUwia0CNnTWqiIRO6WbS8L56BS7/GT0R12LON0wf2M3375NZkP2/BPjE1p8Ed4K4mkPIbxF1titO/aBE+THyAFVDaq9qu8x9RnPz8NbJq045Xba0EM8hDI6tmFp9H1hB7dJPNtlyJNfF1WJeOmgWhpsqyLSwx+BblsJ0WMhlMdWLzitWkhxAnYZXp8ZA59n5HiiyPM7Csb7uee5SeyRe7vHRlpnIToa7LZBmmgI/4MtwfF0+ZAS3XUwfaXpv+m8RxWnvvU+Ju7r/Ens+O+As=</diagram></mxfile>
\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 3f64deea382..31b89858369 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -284,18 +284,6 @@ if (USE_ACCEL)
             $<$<BOOL:${WITH_HIP_PROFILING}>:roctx64>
             $<$<BOOL:${WITH_HIP_PROFILING}>:roctracer64>
             $<$<STREQUAL:${USE_ACCEL},opencl>:OpenCL::OpenCL>)
-
-  if (WITH_G2G)
-    target_compile_definitions(
-      dbcsr
-      PRIVATE __DBCSR_ACC_G2G
-              $<$<STREQUAL:${USE_ACCEL},cuda>:__CUDA>
-              $<$<STREQUAL:${USE_ACCEL},cuda>:ARCH_NUMBER=${ACC_ARCH_NUMBER}>
-              $<$<STREQUAL:${USE_ACCEL},hip>:__HIP>
-              $<$<STREQUAL:${USE_ACCEL},hip>:ARCH_NUMBER=${ACC_ARCH_NUMBER}>
-              $<$<BOOL:${WITH_CUDA_PROFILING}>:__CUDA_PROFILING>
-              $<$<BOOL:${WITH_HIP_PROFILING}>:__HIP_PROFILING>)
-  endif ()
 endif ()
 
 # =================================================================================================
diff --git a/src/acc/acc_bench_smm.c b/src/acc/acc_bench_smm.c
index 79bf0625f03..26bd167ca41 100644
--- a/src/acc/acc_bench_smm.c
+++ b/src/acc/acc_bench_smm.c
@@ -222,21 +222,25 @@ int main(int argc, char* argv[]) {
 #endif
   CHECK(libsmm_acc_init(), &result, check); /* note: libsmm_acc_init() may imply acc_init() */
   if (EXIT_SUCCESS == result) {
-    const char* const env_device = getenv("DEVICE");
-    const int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device));
     int ndevices = 0;
     result = c_dbcsr_acc_get_ndevices(&ndevices);
-    if (0 < ndevices && (0 == device || EXIT_SUCCESS == c_dbcsr_acc_set_active_device(device))) {
-      printf("Activated device%i (ndevices=%i)\n", device, ndevices);
-    }
-    else {
-      if (0 >= ndevices) {
-        fprintf(stderr, "ERROR: No ACC-device found!\n");
+    if (EXIT_SUCCESS == result && 0 < ndevices) {
+      const char* const env_device = getenv("DEVICE");
+      const char* const env_rank = (NULL != getenv("PMI_RANK") ? getenv("PMI_RANK") : getenv("OMPI_COMM_WORLD_LOCAL_RANK"));
+      const int rank = (NULL != env_rank ? atoi(env_rank) : -1);
+      int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device));
+      device = ((0 <= device && device < ndevices) ? (0 <= rank ? (rank % ndevices) : device) : -1);
+      result = c_dbcsr_acc_set_active_device(device);
+      if (EXIT_SUCCESS == result) {
+        printf("Activated device%i (ndevices=%i)\n", device, ndevices);
       }
       else {
-        fprintf(stderr, "ERROR: Failed to activate device %i of %i!\n", device, ndevices);
+        fprintf(stderr, "ERROR: Failed to activate device!\n");
       }
-      result = EXIT_FAILURE;
+    }
+    else {
+      fprintf(stderr, "ERROR: No ACC-device found!\n");
+      if (EXIT_SUCCESS == result) result = EXIT_FAILURE;
     }
     if (EXIT_SUCCESS == result) {
       rnd = (int*)malloc(sizeof(int) * NRAND);
@@ -280,7 +284,7 @@ int main(int argc, char* argv[]) {
 #if defined(USE_LIBXSMM)
       libxsmm_timer_tickint start;
       int print_offset = 0;
-      char print_buffer[1024];
+      char print_buffer[1024] = "";
 #  if defined(__OPENCL)
       const char* const env_smm_repeat = getenv("SMM_NREPEAT");
       const int smm_nrepeat = (NULL == env_smm_repeat ? 1 : MAX(atoi(env_smm_repeat), 1));
@@ -497,7 +501,7 @@ int main(int argc, char* argv[]) {
                 if (maxdiff < epsilon && NULL != file) maxdiff = epsilon;
                 if (0 < epsilon) {
                   if (LIBXSMM_NOTNAN(diff.v_tst)) {
-                    PRINTF(" (|%g-%g|=%g)\n", diff.v_ref, diff.v_tst, fabs(diff.v_ref - diff.v_tst));
+                    PRINTF(" (|%g-%g|=%g)\n", diff.v_ref, diff.v_tst, diff.linf_abs);
                   }
                   else {
                     PRINTF(" (%g)\n", diff.v_tst);
@@ -508,6 +512,7 @@ int main(int argc, char* argv[]) {
                 }
                 if (0 < check && check < epsilon) result = EXIT_FAILURE;
               }
+              else fprintf(stderr, "ERROR: failed to validate!\n");
             }
 #    endif
           }
diff --git a/src/acc/acc_bench_trans.c b/src/acc/acc_bench_trans.c
index 07101a187df..2f9485b839c 100644
--- a/src/acc/acc_bench_trans.c
+++ b/src/acc/acc_bench_trans.c
@@ -106,52 +106,48 @@ int main(int argc, char* argv[]) {
 #else
   const int warmup = 0;
 #endif
-  const char* const env_device = getenv("DEVICE");
-  const int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device));
   int *stack_hst = NULL, *stack_dev = NULL;
   ELEM_TYPE *mat_hst = NULL, *mat_dev = NULL;
-  int result = EXIT_SUCCESS, ndevices = 0, r, i, mm = m, nn = n;
+  int result = EXIT_SUCCESS, mm = m, nn = n, r, i;
   void* stream = NULL;
 #if defined(USE_LIBXSMM)
   libxsmm_timer_tickint start;
   double duration;
 #endif
   assert(m <= (mn / n) && 0 == (mn % n));
+  if (MAX_KERNEL_DIM < m || MAX_KERNEL_DIM < n) {
+    fprintf(stderr, "Matrix shape exceeds MAX_KERNEL_DIM!\n");
+    result = EXIT_FAILURE;
+  }
   CHECK(c_dbcsr_acc_init(), &result);
   /* note: libsmm_acc_init() may imply acc_init() */
   CHECK(libsmm_acc_init(), &result);
   if (EXIT_SUCCESS == result) {
+    int ndevices = 0;
     result = c_dbcsr_acc_get_ndevices(&ndevices);
-    if (0 < ndevices && (0 == device || EXIT_SUCCESS == c_dbcsr_acc_set_active_device(device))) {
-      printf("Activated device%i (ndevices=%i)\n", device, ndevices);
-    }
-    else {
-      if (0 >= ndevices) {
-        fprintf(stderr, "No ACC-device found!\n");
+    if (EXIT_SUCCESS == result && 0 < ndevices) {
+      const char* const env_device = getenv("DEVICE");
+      const char* const env_rank = (NULL != getenv("PMI_RANK") ? getenv("PMI_RANK") : getenv("OMPI_COMM_WORLD_LOCAL_RANK"));
+      const int rank = (NULL != env_rank ? atoi(env_rank) : -1);
+      int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device));
+      device = ((0 <= device && device < ndevices) ? (0 <= rank ? (rank % ndevices) : device) : -1);
+      result = c_dbcsr_acc_set_active_device(device);
+      if (EXIT_SUCCESS == result) {
+        printf("Activated device%i (ndevices=%i)\n", device, ndevices);
+        printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n);
+        printf("typename (id=%i): %s\n", DBCSR_TYPE(ELEM_TYPE), DBCSR_STRINGIFY(ELEM_TYPE));
       }
       else {
-        fprintf(stderr, "Failed to activate device %i of %i!\n", device, ndevices);
+        fprintf(stderr, "ERROR: Failed to activate device!\n");
       }
-#if !defined(__CUDA)
-      CHECK(libsmm_acc_finalize(), NULL);
-#endif
-      CHECK(c_dbcsr_acc_finalize(), NULL);
-      return result;
+    }
+    else {
+      fprintf(stderr, "ERROR: No ACC-device found!\n");
+      if (EXIT_SUCCESS == result) result = EXIT_FAILURE;
     }
   }
   else {
     fprintf(stderr, "ACC initialization failed!\n");
-#if !defined(__CUDA)
-    CHECK(libsmm_acc_finalize(), NULL);
-#endif
-    CHECK(c_dbcsr_acc_finalize(), NULL);
-    return result;
-  }
-  printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n);
-  printf("typename (id=%i): %s\n", DBCSR_TYPE(ELEM_TYPE), DBCSR_STRINGIFY(ELEM_TYPE));
-  if (MAX_KERNEL_DIM < m || MAX_KERNEL_DIM < n) {
-    fprintf(stderr, "Matrix shape exceeds MAX_KERNEL_DIM!\n");
-    result = EXIT_FAILURE;
   }
 #if defined(PRIORITY)
   CHECK(c_dbcsr_acc_stream_priority_range(&priomin, &priomax), &result);
@@ -259,7 +255,7 @@ int main(int argc, char* argv[]) {
   CHECK(c_dbcsr_acc_finalize(), NULL);
   if (EXIT_SUCCESS != result) {
     if (-1 != result) {
-      fprintf(stderr, "FAILED\n");
+      fprintf(stderr, "\nFAILED\n\n");
     }
     else {
       fprintf(stderr, "Kernel not suitable!\n");
diff --git a/src/acc/acc_libsmm.h b/src/acc/acc_libsmm.h
index 06957d74074..93b1623f8bb 100644
--- a/src/acc/acc_libsmm.h
+++ b/src/acc/acc_libsmm.h
@@ -15,11 +15,11 @@
 #define DBCSR_TYPE_double dbcsr_type_real_8
 #define DBCSR_TYPE_float dbcsr_type_real_4
 
-#define LIBSMM_ACC_TRANSPOSE_ROUTINE_NAME_STRPTR ((const char**)&libsmm_acc_transpose_routine_name_ptr)
+#define LIBSMM_ACC_TRANSPOSE_ROUTINE_NAME_STRPTR ((const char**)((uintptr_t)&libsmm_acc_transpose_routine_name_ptr))
 #define LIBSMM_ACC_TRANSPOSE_ROUTINE_NAME_LENPTR (&libsmm_acc_transpose_routine_name_len)
 #define LIBSMM_ACC_TRANSPOSE_ROUTINE_NAME_STR (libsmm_acc_transpose_routine_name_str)
 
-#define LIBSMM_ACC_PROCESS_ROUTINE_NAME_STRPTR ((const char**)&libsmm_acc_process_routine_name_ptr)
+#define LIBSMM_ACC_PROCESS_ROUTINE_NAME_STRPTR ((const char**)((uintptr_t)&libsmm_acc_process_routine_name_ptr))
 #define LIBSMM_ACC_PROCESS_ROUTINE_NAME_LENPTR (&libsmm_acc_process_routine_name_len)
 #define LIBSMM_ACC_PROCESS_ROUTINE_NAME_STR (libsmm_acc_process_routine_name_str)
 
diff --git a/src/acc/cuda/Makefile b/src/acc/cuda/Makefile
index 2aedadeb979..6f6c66b2369 100644
--- a/src/acc/cuda/Makefile
+++ b/src/acc/cuda/Makefile
@@ -103,23 +103,15 @@ ifneq (,$(ELEM_TYPE))
   DFLAGS += -DELEM_TYPE=$(ELEM_TYPE)
 endif
 
-ifeq (1,$(INTEL))
-  CXX := icpc
-  CC := icc
-  AR := xiar
-else ifneq (0,$(INTEL))
-  CXX := icpx
-  CC := icx
-  AR := xiar
-else ifneq (0,$(GNU))
-  override CXX := g++
-  override CC := gcc
-  ifneq (Darwin,$(UNAME))
-    override AR := gcc-ar
+ifneq (0,$(INTEL))
+  ifneq (1,$(INTEL))
+    CXX := icpx
+    CC := icx
   else
-    override AR := ar
+    CXX := icpc
+    CC := icc
   endif
-  #override LD_LIBRARY_DIRS := $(NULL)
+  AR := $(if $(call which,xiar),xiar,ar)
 else
   CXX := g++
   CC := gcc
diff --git a/src/acc/cuda/dbcsr_cuda_profiling.F b/src/acc/cuda/dbcsr_cuda_profiling.F
index ea9bdf46b29..d9364381bb8 100644
--- a/src/acc/cuda/dbcsr_cuda_profiling.F
+++ b/src/acc/cuda/dbcsr_cuda_profiling.F
@@ -17,7 +17,7 @@ MODULE dbcsr_cuda_profiling
                           int_8
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
+!$ USE OMP_LIB, ONLY: omp_get_thread_num
 
    IMPLICIT NONE
 
diff --git a/src/acc/cuda_hip/acc_init.cpp b/src/acc/cuda_hip/acc_init.cpp
index b1e70178c0f..2a4b684ded3 100644
--- a/src/acc/cuda_hip/acc_init.cpp
+++ b/src/acc/cuda_hip/acc_init.cpp
@@ -26,8 +26,10 @@ extern "C" int c_dbcsr_acc_init() {
   ACC_DRV(device) acc_device;
   ACC_API_CALL(GetDevice, (&myDevice));
   ACC_DRV_CALL(DeviceGet, (&acc_device, myDevice));
+#if defined(__CUDA)
   ACC_DRV(context) ctx;
   ACC_DRV_CALL(DevicePrimaryCtxRetain, (&ctx, acc_device));
+#endif
   ACC_API_CALL(RuntimeGetVersion, (&runtimeVersion));
 
   // Initialize libsmm_acc, DBCSR's GPU backend
@@ -41,6 +43,8 @@ extern "C" int c_dbcsr_acc_finalize() {
   ACC_DRV(device) acc_device;
   ACC_API_CALL(GetDevice, (&myDevice));
   ACC_DRV_CALL(DeviceGet, (&acc_device, myDevice));
+#if defined(__CUDA)
   ACC_DRV_CALL(DevicePrimaryCtxRelease, (acc_device));
+#endif
   return libsmm_acc_finalize();
 }
diff --git a/src/acc/dbcsr_acc_device.F b/src/acc/dbcsr_acc_device.F
index 7b4d29f25c6..d9ec94526e3 100644
--- a/src/acc/dbcsr_acc_device.F
+++ b/src/acc/dbcsr_acc_device.F
@@ -13,6 +13,8 @@ MODULE dbcsr_acc_device
 #endif
 #include "base/dbcsr_base_uses.f90"
 
+!$ USE OMP_LIB, ONLY: omp_get_level
+
    IMPLICIT NONE
 
    PUBLIC :: dbcsr_acc_get_ndevices, dbcsr_acc_set_active_device, dbcsr_acc_clear_errors
@@ -83,11 +85,16 @@ SUBROUTINE dbcsr_acc_set_active_device(device_id)
 #if defined (__DBCSR_ACC)
       INTEGER :: istat
 
-!$OMP PARALLEL DEFAULT(NONE) PRIVATE(istat) SHARED(device_id)
-      istat = acc_set_active_device_cu(device_id)
+!$    IF (0 == omp_get_level()) THEN
+         istat = 0
+!$OMP    PARALLEL DEFAULT(NONE) SHARED(device_id) REDUCTION(MAX:istat)
+         istat = acc_set_active_device_cu(device_id)
+!$OMP    END PARALLEL
+!$    ELSE
+         istat = acc_set_active_device_cu(device_id)
+!$    END IF
       IF (istat /= 0) &
          DBCSR_ABORT("dbcsr_acc_set_active_device: failed")
-!$OMP END PARALLEL
 
 #else
       MARK_USED(device_id)
diff --git a/src/acc/libsmm_acc/README.md b/src/acc/libsmm_acc/README.md
index faa0aab6806..8978689c00b 100644
--- a/src/acc/libsmm_acc/README.md
+++ b/src/acc/libsmm_acc/README.md
@@ -12,12 +12,10 @@ For a description of the library (some details are outdated, but this neverthele
 
 ## Directory Organization
 
-- [`kernels/`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/kernels/): GPU kernels (CUDA- and HIP-compatible) for matrix-matrix multiplication and python interface to autotuning and predictive code.
-- [`notebooks/`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/notebooks/): jupyter notebooks for exploring data generated from autotuning and prediction.
+- [`kernels/`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/kernels/): GPU kernels (CUDA- and HIP-compatible) for matrix-matrix multiplication and Python interface to autotuning code.
 - `generate_*.py`: utility scripts for `libsmm_acc` compilation
 - `libsmm_acc*`: libsmm_acc C++ and CUDA / HIP code
-- [`parameters/`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/parameters/): contains `parameters_GPU.json` files. These are sets of matrix-matrix multiplication parameters for different (m, n, k)-triplets optimized for a given GPU card. You can explore these parameters interactively using the [provided jupyter notebook](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/notebooks/inspect_autotuned_parameters.ipynb)
-- [`predict/`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/predict/): scripts for prediction of optimal parameter sets, see [predictive modeling of kernel parameters](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/predict/README.md)
+- [`parameters/`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/parameters/): contains `parameters_GPU.json` files. These are sets of matrix-matrix multiplication parameters for different (m, n, k)-triplets optimized for a given GPU card.
 - [`tune/`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/tune/): scripts for autotuning of optimal parameter sets, see [autotuning of kernel parameters](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/tune/README.md)
 
 ## Matrix-matrix Multiplication Kernels and Parameters
@@ -46,7 +44,7 @@ which take between 3 - 7 **parameters** (see figure at the top):
 - **w**: input slab width (width of slab `P_A` and `P_B`)
 - **v**: output slab width (width of slab `P_C`)
 
-The performance of the matrix-matrix multiplication kernels is highly dependent on the choice of algorithm and parameters. For this reason, `libsmm_acc` provides lists of optimal parameters for different GPU cards and different (m, n, k)-triplets. These sets of optimal parameters can be found either through *autotuning* or *predictive modeling*.
+The performance of the matrix-matrix multiplication kernels is highly dependent on the choice of algorithm and parameters. For this reason, `libsmm_acc` provides lists of optimal parameters for different GPU cards and different (m, n, k)-triplets.
 
 ## Contributing to libsmm_acc
 
@@ -56,19 +54,13 @@ We expect users to contribute to the library by providing new optimized kernels
 
 Follow the [autotuning procedure](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/tune/README.md)
 
-#### Predictive modeling of kernel parameters
-
-Follow the [predictive modeling procedure](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/predict/README.md)
-
 #### Adding a new kernel
 
 1. Choose a kernel `name`
 
 2. Add the kernel's code (must be able to compile by both `nvcc` and `hip`) in file `kernels/smm_acc_dnt_name.h`
 
-3. Add python kernel class inheriting from base class `kernels/smm_acc_dnt_name.py`
-
-4. Add the new kernel to the `kernel_algorithm` data structure in [`kernels/smm_acc_predict.py`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/kernels/smm_acc_predict.py)
+3. Add a Python kernel class, inheriting from the base class, in file `kernels/smm_acc_dnt_name.py`
 
 #### Adding support for a new GPU card
 
@@ -85,4 +77,4 @@ Follow the [predictive modeling procedure](https://github.com/cp2k/dbcsr/blob/de
 }
 ```
 
-then add matrix-matrix multiplication parameters for this GPU using *autotuning* and *predictive modeling*
+then add matrix-matrix multiplication parameters for this GPU using *autotuning*.
diff --git a/src/acc/libsmm_acc/kernels/README.md b/src/acc/libsmm_acc/kernels/README.md
index 31e4b81eb61..14a268c3d3d 100644
--- a/src/acc/libsmm_acc/kernels/README.md
+++ b/src/acc/libsmm_acc/kernels/README.md
@@ -14,8 +14,6 @@
 
   * `smm_acc_dnt_ALGORITHM.h` Batched Multiply Kernel CUDA/HIP code
 
-* [`smm_acc_predict.py`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/kernels/smm_acc_predict.py) Class and helper functions for parameter prediction procedure
-
 * [`smm_acc_transpose.h`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/kernels/smm_acc_transpose.h) Transpose CUDA/HIP code
 
 ## Batched Multiplication Kernels
diff --git a/src/acc/libsmm_acc/kernels/smm_acc_dnt_medium.h b/src/acc/libsmm_acc/kernels/smm_acc_dnt_medium.h
index 7f70b2835d5..01e71d2d562 100644
--- a/src/acc/libsmm_acc/kernels/smm_acc_dnt_medium.h
+++ b/src/acc/libsmm_acc/kernels/smm_acc_dnt_medium.h
@@ -422,7 +422,7 @@ __global__ void __launch_bounds__(threads, minblocks) smm_acc_dnt_medium(const i
         }
         if (need_sync) syncthreads();
 
-          /* Add results from shared memory buffer to global C block. */
+        /* Add results from shared memory buffer to global C block. */
 #pragma unroll
         for (int i = tidx; i < mn; i += threads) {
           atomicAdd(&c_data[srcC + i], buff[i]);
diff --git a/src/acc/libsmm_acc/kernels/smm_acc_dnt_small.h b/src/acc/libsmm_acc/kernels/smm_acc_dnt_small.h
index 767c02f4025..51f62b24a64 100644
--- a/src/acc/libsmm_acc/kernels/smm_acc_dnt_small.h
+++ b/src/acc/libsmm_acc/kernels/smm_acc_dnt_small.h
@@ -114,7 +114,7 @@ __global__ void __launch_bounds__(threads, minblocks) smm_acc_dnt_small(const in
   nrun = grouping;
   if (((bidx + 1) * grouping) > stack_size) nrun = stack_size % grouping;
 
-    /* Set the partial sum (tile T) to zero */
+  /* Set the partial sum (tile T) to zero */
 #pragma unroll
   for (int i = 0; i < M * N; i++) myc[i] = 0.0;
 
@@ -203,7 +203,7 @@ __global__ void __launch_bounds__(threads, minblocks) smm_acc_dnt_small(const in
 
         if (need_sync) syncthreads();
 
-          /* Add results from shared memory buffer to global C block. */
+        /* Add results from shared memory buffer to global C block. */
 #pragma unroll
         for (int i = tidx; i < mn; i += threads) atomicAdd(&c_data[srcC + i], buff[i]);
       }
diff --git a/src/acc/libsmm_acc/notebooks/README.md b/src/acc/libsmm_acc/notebooks/README.md
deleted file mode 100644
index df0114cf103..00000000000
--- a/src/acc/libsmm_acc/notebooks/README.md
+++ /dev/null
@@ -1,12 +0,0 @@
-# libsmm_acc Notebooks
-
-Notebooks for exploring data generated from auto-tuning and prediction.
-
-**Requirements**
-Python version required: python 3.6+
-
-Install all python packages required (if you do not want this project's requirements to interfere with your other Python projects, consider doing so in a [virtual environment](https://docs.python.org/3/tutorial/venv.html)), using
-
-```bash
-pip install -r requirements.txt
-```
diff --git a/src/acc/libsmm_acc/notebooks/inspect_autotuned_parameters.ipynb b/src/acc/libsmm_acc/notebooks/inspect_autotuned_parameters.ipynb
deleted file mode 100644
index e2e971ede58..00000000000
--- a/src/acc/libsmm_acc/notebooks/inspect_autotuned_parameters.ipynb
+++ /dev/null
@@ -1,279 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "collapsed": true
-   },
-   "source": [
-    "# `libcusmm`: explore the space of autotuned parameters"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This notebook can be used to explore the space of autotuned parameters, stored in files named `parameters_GPU.json`."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Library imports"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "import json, os"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Read data"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Choose a GPU"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "GPU = 'P100' # Options: K20X, K40, K80, P100, V100, Mi50, Mi100, Mi250"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "params = '../parameters_' + GPU + '.json'  \n",
-    "assert os.path.exists(params)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### Read autotuned parameters"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open(params) as f:\n",
-    "    all_parameters = pd.DataFrame([params for params in json.load(f)])\n",
-    "autotuned_parameters = all_parameters[all_parameters['source'] == 'autotuned']\n",
-    "print(\"Reading autotuned data from\", params)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "ordered_columns = ['m', 'n', 'k', 'perf', 'algorithm', 'threads', 'grouping', 'minblocks', 'tile_m', 'tile_n', 'v', 'w']\n",
-    "autotuned_parameters = autotuned_parameters[ordered_columns]\n",
-    "print('Autotuned parameters:')\n",
-    "display(autotuned_parameters)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Data Description"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print('Numer of columns:', len(autotuned_parameters.columns), '\\nNumber of rows:', len(autotuned_parameters.index.values))\n",
-    "print('\\nColumn names:')\n",
-    "for c in autotuned_parameters.columns.values: \n",
-    "    print(c)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "autotuned_parameters.describe()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "import pandas_profiling \n",
-    "pandas_profiling.ProfileReport(autotuned_parameters)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Plot performances"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%matplotlib inline \n",
-    "import matplotlib.pyplot as plt"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "autotuned_parameters['mnk'] = autotuned_parameters['m'] * autotuned_parameters['n'] * autotuned_parameters['k']\n",
-    "plt.semilogx(autotuned_parameters['mnk'], autotuned_parameters['perf'], '.', markersize=3)\n",
-    "plt.xlabel('Training (m, n, k) triplets (in order of increasing m*n*k)')\n",
-    "plt.ylabel('Performance [Gflops]')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Parameter frequencies"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# ignore the 'threads' parameter since it has to be adapted to the size of matrix C\n",
-    "parameter_set = ['algorithm', 'grouping', 'minblocks', 'tile_m', 'tile_n', 'v', 'w']"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Most frequent parameter sets"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def get_par_set(algorithm, grouping, minblocks, tile_m, tile_n, v, w):\n",
-    "    par_set= algorithm + '_' + str(int(grouping)) + '_' + str(int(minblocks)) \n",
-    "    if not np.isnan(tile_m):\n",
-    "        par_set +=  '_' + str(int(tile_m)) + '_' + str(int(tile_n))\n",
-    "        if not np.isnan(v):\n",
-    "            par_set += '_' + str(int(v)) + '_' + str(int(w))\n",
-    "    return par_set\n",
-    "    \n",
-    "vget = np.vectorize(get_par_set)\n",
-    "autotuned_parameters['param_set'] = vget(*[a for a in autotuned_parameters[parameter_set].values.transpose()])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "param_set_freq = autotuned_parameters['param_set'].value_counts(dropna=True)\n",
-    "autotuned_parameters['param_set_freq'] = autotuned_parameters['param_set'].apply(lambda item: param_set_freq[item])\n",
-    "autotuned_parameters.sort_values(by='param_set_freq', ascending=False, inplace=True)\n",
-    "autotuned_parameters.iloc[:50,:]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Most frequent parameters (independently of each other)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "most_frequent_values = dict()\n",
-    "for c in autotuned_parameters.columns.values: \n",
-    "    plt.figure\n",
-    "    plt.hist(autotuned_parameters[c].dropna(), bins=50)\n",
-    "    plt.title(c)\n",
-    "    plt.show()\n",
-    "    if c in parameter_set: \n",
-    "        col = autotuned_parameters[c].dropna().values\n",
-    "        values, counts = np.unique(col, return_counts=True)\n",
-    "        ind_most_freq = np.argmax(counts)\n",
-    "        most_freq_val = values[ind_most_freq]\n",
-    "        most_frequent_values[c] = most_freq_val"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 1
-}
diff --git a/src/acc/libsmm_acc/notebooks/inspect_training_data.ipynb b/src/acc/libsmm_acc/notebooks/inspect_training_data.ipynb
deleted file mode 100644
index 7778bd35aef..00000000000
--- a/src/acc/libsmm_acc/notebooks/inspect_training_data.ipynb
+++ /dev/null
@@ -1,607 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "collapsed": true
-   },
-   "source": [
-    "# `libcusmm`: Explore the Training Data"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This notebook allows you to explore the training data collected from autotuning before proceeding to training."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Import libraries"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import re, sys, os, json, random\n",
-    "import numpy as np\n",
-    "import pandas as pd\n",
-    "import dask.dataframe as dd\n",
-    "from nb_helper import *"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Read training data from autotuning folders \n",
-    "\n",
-    "Read from files of form `tune_*x*x*/raw_training_data_*x*x*_algo.csv`. \n",
-    "If you want to read from aggregated Parquet files (recommended), skip to lower"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Path to autotuning data"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Provide the path to the autotuning data:\n",
-    "- You can use the bash cell below to navigate your filetree:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%bash\n",
-    "ls -ad AUTOTUNING_DATA_PATH/tune_*x*x*/"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "- Then, copy what you've replaced `AUTOTUNING_DATA_PATH` with in the Python variable `autotuning_data_path` below:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "autotuning_data_path = '' # may not recognize '~', please provide an absolute path:\n",
-    "check_autotuning_data_path(autotuning_data_path)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Set options\n",
-    "\n",
-    "Set the following options appropriately:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "to_read = 100       # How many / which data folders to read. Options: \n",
-    "                    # - 'all': reads from all available data folders. \n",
-    "                    #   Beware, this might result in memory errors if large amounts of data are made available\n",
-    "                    # - a number: reads this number of data folders (e.g. 100)\n",
-    "                    # - a regex: reads the data folders with matching regex (e.g. tune_4x*x*)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "algorithm = get_algorithm_to_explore('all')   # algorithms to explore. Options: all, tiny, small, medium"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get the list of folders to read\n",
-    "folders_to_read = get_folders_to_read(to_read, autotuning_data_path)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Read training data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "raw_files_to_read, derived_files_to_read = get_files_to_read(folders_to_read, algorithm)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "%%time\n",
-    "num_f = len(files_to_read)\n",
-    "data_raw = dd.read_csv(raw_files_to_read, dtype={}).set_index(\"Unnamed: 0\")\n",
-    "data_derived = dd.read_csv(derived_files_to_read, dtype={}).set_index(\"Unnamed: 0\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# merge the two: "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Read training data from Parquet files\n",
-    "\n",
-    "Read from files of form `training_data_algorithm.parquet`."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Path to autotuning data"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Provide the path to the autotuning data:\n",
-    "- You can use the bash cell below to navigate your filetree:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%bash\n",
-    "ls -ad AUTOTUNING_DATA_PATH/*.parquet"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "- Then, copy what you've replaced `AUTOTUNING_DATA_PATH` with in the Python variable `training_data_path` below:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "training_data_path = '../tune_dataset_V100/' # may not recognize '~', please provide an absolute path:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "algorithm = \"small\" # algorithm to explore. Options: tiny, small, medium, largeDB1, largeDB2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "parquet_data_file = os.path.join(training_data_path, \"training_data_\" + algorithm + \".parquet\")\n",
-    "data = dd.read_parquet(parquet_data_file)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Data inspection\n",
-    "\n",
-    "### Data head"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "for i in range(0, len(data.columns.values), page_width):\n",
-    "    display(data.iloc[:,i:i+page_width].head())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Data description"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print('Data size        :', sys.getsizeof(data)/10**6, 'MB')\n",
-    "print('Number of columns:', len(data.columns.values))\n",
-    "print('Number of rows   : {:,}'.format(len(data.index)))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "#for i in range(0, len(data.columns.values), page_width):\n",
-    "#    display(data.iloc[:,i:i+page_width].describe())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Columns"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "#print('Number of columns:', len(data.columns), '\\nNumber of rows:', len(data.index), '\\n')\n",
-    "for col in data.columns: \n",
-    "    print('{:<40} {}'.format(col, data[col].dtype))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Feature categories\n",
-    "mnk = ['m', 'n', 'k']\n",
-    "kernel_pars = ['algorithm', 'threads_per_blk', 'grouping', 'minblocks',\n",
-    "               'tile_m', 'tile_n', 'w', 'v', 'nbytes_smem', 'nbytes_cmem', 'regs_per_thread']\n",
-    "kernel_pars = list(set(kernel_pars) & set(data.columns.values))\n",
-    "perf =  ['perf (Gflop/s)', 'perf_scaled']\n",
-    "common = ['Gflops', 'mxnxk', 'size_a', 'size_b', 'size_c', 'nblks', \n",
-    "          'warps_per_blk', 'nwarps', 'sm_desired', 'nthreads', 'ru_param_stack_unroll_factor']"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Features\n",
-    "\n",
-    "Features in the left-most column correspond to \"raw\" parameters\n",
-    "* **green** kernel parameters \n",
-    "* **grey** GPU card properties (taken from Nvidia/AMD documentation) \n",
-    "* **pink** autotuning parameters (taken from DBCSR codebase) \n",
-    "\n",
-    "Other features correspond to derived parameters, computed from the \"raw\" parameters\n",
-    "* **yellow** matrix sizes\n",
-    "* **light grey** launch parameters\n",
-    "* **blue** and **purple** estimations of resource usages"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "![parameters dependency graph](../../../../docs/media/images/libsmm_acc_predictive_modeling_features.png)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "thresh = 300000     # do not perform very long operations on row counts above this threshold"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "data_to_profile = data\n",
-    "n_rows_data = len(data)\n",
-    "if n_rows_data > thresh:  # if it is a very large dataframe, perform op on subsampled rows\n",
-    "    data_to_profile = data.sample(frac = thresh / n_rows_data)\n",
-    "\n",
-    "import pandas_profiling \n",
-    "pandas_profiling.ProfileReport(data_to_profile.compute())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Data visualization"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%matplotlib inline\n",
-    "import matplotlib\n",
-    "import matplotlib.pyplot as plt\n",
-    "import seaborn as sns"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Get Series from Dask to Pandas\n",
-    "data_mxnxk = data['mxnxk'].compute()\n",
-    "data_perf = data['perf (Gflop/s)'].compute()\n",
-    "data_perf_scaled = data['perf_scaled'].compute()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "plt.semilogx(data_mxnxk, data_perf, '.', markersize=1)\n",
-    "plt.xlabel('Training (m, n, k) triplets (in order of increasing m*n*k)')\n",
-    "plt.ylabel('Performance [Gflops]')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Data visualization (scaled performance)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "plt.plot(data_mxnxk, data_perf_scaled, '.', markersize=1)\n",
-    "plt.xlabel('Training (m, n, k) triplets (in order of increasing m*n*k)')\n",
-    "plt.ylabel('Performance scaled (overall)')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Performance profile"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Choose (m, n, k) triplet\n",
-    "m_plot, n_plot, k_plot = (4, 4, 4)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "data_mnk = data[data['m'] == m_plot][ \n",
-    "                data['n'] == n_plot][ \n",
-    "                data['k'] == k_plot].compute()\n",
-    "data_mnk.sort_values(by='perf (Gflop/s)', ascending=True, inplace=True)\n",
-    "plt.plot(data_mnk['perf (Gflop/s)'].values)\n",
-    "plt.xlabel('parameter set')\n",
-    "plt.ylabel('perf (Gflop/s)')\n",
-    "plt.title('Performance profile for kernel ' + str(m_plot) + 'x'+ str(n_plot) + 'x'+ str(k_plot))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Histograms with Bokeh\n",
-    "from bokeh.plotting import figure \n",
-    "from bokeh.models import ColumnDataSource, HoverTool\n",
-    "from bokeh.io import output_notebook, show\n",
-    "output_notebook()\n",
-    "\n",
-    "# Create histogram\n",
-    "num_bins = 100 \n",
-    "hist, edges = np.histogram(data_mnk['perf (Gflop/s)'], bins=num_bins)\n",
-    "df_hist = pd.DataFrame({'hist': hist, 'left': edges[:-1], 'right': edges[1:]})\n",
-    "source = ColumnDataSource(df_hist)\n",
-    "\n",
-    "# Create tool \n",
-    "hover = HoverTool(tooltips=[('# occurences', '@hist'), ('low', '@left'), ('high', '@right')])\n",
-    "\n",
-    "# Create the figure\n",
-    "p = figure(plot_width=800, plot_height=800, title=\"Performance histogram\",\n",
-    "           toolbar_location=None, tools=\"\")\n",
-    "p.xgrid.grid_line_color = None\n",
-    "p.xaxis.axis_label = \"Performance (GFlop/s)\"\n",
-    "p.xaxis.major_label_orientation = 1.2\n",
-    "p.yaxis.axis_label = \"# occurrences\"\n",
-    "p.quad(source=source, bottom=0, top='hist', left='left', right='right', fill_color='blue')\n",
-    "p.add_tools(hover)\n",
-    "show(p)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Histograms with Bokeh\n",
-    "from bokeh.plotting import figure \n",
-    "from bokeh.models import ColumnDataSource, HoverTool\n",
-    "from bokeh.io import output_notebook, show\n",
-    "output_notebook()\n",
-    "\n",
-    "# Create histogram\n",
-    "num_bins = 100 \n",
-    "hist, edges = np.histogram(data_mnk['perf_scaled'], bins=num_bins)\n",
-    "df_hist = pd.DataFrame({'hist': hist, 'left': edges[:-1], 'right': edges[1:]})\n",
-    "source = ColumnDataSource(df_hist)\n",
-    "\n",
-    "# Create tool \n",
-    "hover = HoverTool(tooltips=[('# occurences', '@hist'), ('low', '@left'), ('high', '@right')])\n",
-    "\n",
-    "# Create the figure\n",
-    "p = figure(plot_width=800, plot_height=800, title=\"Performance histogram\",\n",
-    "           toolbar_location=None, tools=\"\")\n",
-    "p.xgrid.grid_line_color = None\n",
-    "p.xaxis.axis_label = \"Performance scaled\"\n",
-    "p.xaxis.major_label_orientation = 1.2\n",
-    "p.yaxis.axis_label = \"# occurrences\"\n",
-    "p.quad(source=source, bottom=0, top='hist', left='left', right='right', fill_color='blue')\n",
-    "p.add_tools(hover)\n",
-    "show(p)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Top slices of perf. distribution\n",
-    "pars_autotuning_top = {\n",
-    "    5: list(), \n",
-    "    2: list(), \n",
-    "    1: list(), \n",
-    "    0.5: list()\n",
-    "}\n",
-    "max_perf = float(data_mnk['perf (Gflop/s)'].max())\n",
-    "max_perf_idx = data_mnk['perf (Gflop/s)'].idxmax()\n",
-    "max_perf_row = data_mnk.loc[max_perf_idx]\n",
-    "max_perf_cond = max_perf_row[mnk + kernel_pars + ['perf (Gflop/s)']]\n",
-    "\n",
-    "print('Maximally performing parameter set:')\n",
-    "display(max_perf_cond)\n",
-    "for perc in pars_autotuning_top.keys():\n",
-    "    lim = max_perf - max_perf*perc/100\n",
-    "    blob = data_mnk.loc[data_mnk['perf (Gflop/s)'] >= lim]\n",
-    "    print('\\ntop', perc, '%')\n",
-    "    display(blob[kernel_pars + ['perf (Gflop/s)']].describe())\n",
-    "    pars_autotuning_top[perc].append(blob)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Pair plot "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "data_pairplot = data\n",
-    "n_rows_data = len(data)\n",
-    "if n_rows_data > thresh:  # if it is a very large dataframe, perform op on subsampled rows\n",
-    "    data_pairplot = data.sample(frac = thresh / n_rows_data)\n",
-    "\n",
-    "sns.pairplot(data_pairplot[mnk + kernel_pars + perf].compute().dropna())"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 1
-}
diff --git a/src/acc/libsmm_acc/notebooks/libsmm_acc_predictive_modeling_features.png b/src/acc/libsmm_acc/notebooks/libsmm_acc_predictive_modeling_features.png
deleted file mode 120000
index b77db205b4d..00000000000
--- a/src/acc/libsmm_acc/notebooks/libsmm_acc_predictive_modeling_features.png
+++ /dev/null
@@ -1 +0,0 @@
-../../../../docs/media/images/libsmm_acc_predictive_modeling_features.png
\ No newline at end of file
diff --git a/src/acc/libsmm_acc/notebooks/nb_helper.py b/src/acc/libsmm_acc/notebooks/nb_helper.py
deleted file mode 100644
index 1897b8b0320..00000000000
--- a/src/acc/libsmm_acc/notebooks/nb_helper.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# -*- coding: utf-8 -*-
-####################################################################################################
-# Copyright (C) by the DBCSR developers group - All rights reserved                                #
-# This file is part of the DBCSR library.                                                          #
-#                                                                                                  #
-# For information on the license, see the LICENSE file.                                            #
-# For further information please visit https://dbcsr.cp2k.org                                      #
-# SPDX-License-Identifier: GPL-2.0+                                                                #
-####################################################################################################
-
-import os
-import re
-
-
-# ===============================================================================
-# I/O
-# kernel_folder_pattern = re.compile('tune_(\d+)x(\d+)x(\d+)$')
-kernel_folder_pattern = re.compile(r"tune_(\d+x\d+x\d+)$")
-page_width = 5  # columns per output line
-
-
-def check_autotuning_data_path(autotuning_data_path):
-    # sanity checks
-    assert os.path.exists(autotuning_data_path), (
-        "This path does not exist: " + autotuning_data_path
-    )
-    assert len(os.listdir(autotuning_data_path)) > 0, (
-        "No folders found in path: " + autotuning_data_path
-    )
-    # print infos
-    print(
-        "Number of tuning data folders found: {}".format(
-            len(os.listdir(autotuning_data_path))
-        )
-    )
-
-
-def get_folders_to_read(to_read, autotuning_data_path):
-    if to_read == "all":
-        folders_to_read = [
-            os.path.join(autotuning_data_path, f)
-            for f in os.listdir(autotuning_data_path)
-            if kernel_folder_pattern.match(f) is not None
-        ]
-    elif isinstance(to_read, int):
-        folders_to_read = [
-            os.path.join(autotuning_data_path, f)
-            for f in os.listdir(autotuning_data_path)
-            if kernel_folder_pattern.match(f) is not None
-        ]
-        folders_to_read = folders_to_read[:to_read]
-    elif isinstance(to_read, str):
-        to_read = re.compile(to_read)
-        folders_to_read = [
-            os.path.join(autotuning_data_path, f)
-            for f in os.listdir(autotuning_data_path)
-            if to_read.match(f) is not None
-        ]
-    else:
-        raise AssertionError("Cannot recognize option: " + to_read)
-
-    num_folders_to_read = len(folders_to_read)
-    assert num_folders_to_read > 0
-    print("Data folders to be read from (total: {:,})\n".format(num_folders_to_read))
-    for f in folders_to_read:
-        print(f)
-
-    return folders_to_read
-
-
-def get_algorithm_to_explore(algo):
-    algo_to_read = (
-        [algo] if algo != "all" else ["tiny", "small", "medium", "largeDB1", "largeDB2"]
-    )
-    print("Algorithm(s) to explore:")
-    for a in algo_to_read:
-        print(a)
-
-    return algo_to_read
-
-
-def get_files_to_read(folders_to_read, algo_to_read):
-    files_to_read = list()
-    for i, kernel_folder in enumerate(folders_to_read):
-        print(
-            "\nfrom {}, read                                  ({}/{:,})".format(
-                kernel_folder, i + 1, len(folders_to_read)
-            )
-        )
-
-        for name_algo in algo_to_read:
-            mnk_string = kernel_folder_pattern.search(kernel_folder).groups()[0]
-            raw_file_base = "raw_training_data_" + mnk_string + "_" + name_algo + ".csv"
-            raw_file = os.path.join(kernel_folder, raw_file_base)
-            derived_file_base = "training_data_" + mnk_string + "_" + name_algo + ".csv"
-            derived_file = os.path.join(kernel_folder, derived_file_base)
-
-            if os.path.exists(raw_file) and os.path.exists(derived_file):
-                # Read raw parameters file
-                files_to_read.append(raw_file)
-
-                # Read derived parameters file
-                files_to_read.append(derived_file)
-
-            else:
-                if not os.path.exists(raw_file):
-                    print("\t...{:50} no file".format(raw_file_base))
-                if not os.path.exists(derived_file):
-                    print("\t...{:50} no file".format(derived_file_base))
-
-    return files_to_read
diff --git a/src/acc/libsmm_acc/notebooks/requirements.txt b/src/acc/libsmm_acc/notebooks/requirements.txt
deleted file mode 100644
index f36ef7a07a0..00000000000
--- a/src/acc/libsmm_acc/notebooks/requirements.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-bokeh==1.0.4
-matplotlib==3.0.2
-numpy==1.22.0
-pandas==0.23.4
-pandas-profiling==1.4.1
-seaborn==0.9.0
diff --git a/src/acc/libsmm_acc/predict/README.md b/src/acc/libsmm_acc/predict/README.md
deleted file mode 100644
index ddb967a2679..00000000000
--- a/src/acc/libsmm_acc/predict/README.md
+++ /dev/null
@@ -1,150 +0,0 @@
-# Training Procedure for Predictive Modeling of Optimal Parameters in `libsmm_acc`
-
-The performance of the matrix-matrix multiplication kernels is highly dependent on the choice of algorithm and parameters, this is why [*autotuning*](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/README.md) is used to find optimal kernel parameters.
-
-However, the auto-tuning procedure is expensive, and the space of (m,n,k)-triplets to explore is large. The following predictive modeling procedure is set up to predict optimal parameters for (m,n,k)-triplets that have not been auto-tuned from the data gathered from auto-tuning other (m,n,k)-triplets.
-
----
-
-### Requirements
-
-Python version required: `python 3.6+`
-
-Install all python packages required (if you do not want this project's requirements to interfere with your other Python projects, consider doing so in a [virtual environment](https://docs.python.org/3/tutorial/venv.html)), using
-
-```bash
-pip install -r requirements.txt
-```
-
----
-
-### Predictive parameters
-
-The input features for the predictive models can be 'raw' parameters, or hand-engineered features 'derived' from the raw features (matrix sizes, launch parameters and resource usage estimations).
-
----
-
-### Predictive modeling procedure
-
-#### 1. Get the data
-
-Get the data to be used for training, either by downloading data from the [dedicated repository](https://github.com/cp2k/dbcsr-data), or by auto-tuning new kernels yourself and combining them with pre-existing data.
-
-##### 1.a Download pre-collected data from dedicated repository
-
-- Download data from the dedicated repository:
-
-  ```bash
-  wget https://github.com/cp2k/dbcsr-data/blob/master/GPU/raw_training_data_ALGORITHM.csv  # for ALGORITHM = tiny, small, medium, largeDB1, largeDB2
-  ```
-
-- Compute derived parameters from raw parameters and create a record of baseline and maximum performances: run [`prepare_training_data.py`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/predict/prepare_training_data.py), providing the CUDA/HIP architecture number and the location of the downloaded data:
-
-  ```bash
-  ./prepare_training_data.py # –arch 60 --folder /scratch/autotuning_dataset, e.g.
-  ```
-
-##### 1.b (optional) Aquire data from auto-tuning
-
-- We would appreciate if you would upload the data resulting from your auto-tuning procedure to the [dedicated repository](https://github.com/cp2k/dbcsr-data). For this, please take note, at this stage, of the [information required to upload your data](https://github.com/cp2k/dbcsr-data/blob/master/git-commit.template).
-
-- If you're auto-tuning data for a new GPU, make sure that the GPU's compute architecture properties are given in the file [`kernels/gpu_properties.json`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/kernels/gpu_properties.json). If not, please add them.
-
-- Follow the [instructions for auto-tuning](tune.md).
-
-- If all went well, you now have directories named `tune_mxnxk` containing log files in which parameter sets and their corresponding measured performances are recorded.
-
-- Collect the information in all the `tune_mxnxk` directories into CSV files: run [`predict_collect.py`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/predict/predict_collect.py), providing the location of the auto-tuning data:
-
-  ```bash
-  ./predict_collect.py # --folder /scratch/autotuning_dataset, e.g.
-  ```
-
-You should now have 5 CSV files containing raw data (`raw_training_data_ALGORITHM.csv`, for `ALGORITHM = tiny, small, medium, largeDB1, largeDB2`)
-
-#### 2. Prepare the data for predictive modeling
-
-A few steps are needed to make the data ready for training:
-
-- Record maximum and baseline performances of (m,n,k)-triplets in JSON files
-- Compute derived training data and write it to a CSV file
-- Compress training data files from CSV to Parquet files
-
-```bash
-./prepare_training_data.py  # --folder /scratch/autotuning_dataset -a 60 -j12, e.g. to run with 12 threads
-```
-
-The data preparation is relatively computationally expensive, especially for large data sets.
-A good way of running it, is to
-
-1. Compute just the maximum and baseline parameters for each algorithm separately (`-l ALGORITHM --skip_derived_data=True`), adjusting the `-j` parameter so it runs fast enough, while not running into "out-of-memory"-errors
-2. Run again with `--skip_derived_data=True` to create the files that aggregate maximum and baseline performances for all algorithms.
-3. Compute derived data records for each algorithm separately (`-l ALGORITHM`), adjusting the `-j` option.
-4. Run the script again without specifying the algorithm nor skipping the derived data to make sure all necessary files have been generated.
-
-##### At the end, you should end up with the following files:
-
-- `raw_training_data_ALGORITHM.csv` (containing all *raw* parameters for training a model for algorithm ALGORITHM, obtained in step 1)
-- `training_data_ALGORITHM.csv` (containing all *derived* parameters for training a model for algorithm ALGORITHM)
-- `training_data_ALGORITHM.parquet` (containing all *raw* and *derived* parameters for training a model for algorithm ALGORITHM in Parquet files, convenient for reading in parallel using Dask)
-- `baseline_performances_ALGORITHM.json` and `baseline_performances_by_algo.json` (containing, for each (m, n, k)-triplet in the training data, its baseline performance, i.e. its performance were it to be run with a set of parameters that are an expert's "best guess"). Additionally, the baseline performances are plotted in `baseline_performances.svg`.
-- `maximum_performances_ALGORITHM.json`, `max_performances_by_algo.json` and `max_performances.json` (containing, for each (m, n, k)-triplet, its maximum performance). Additionally, the maximum performances are plotted in `maximum_performances.svg`.
-
-#### 3. (optional) Explore the data
-
-Explore the data interactively using the [provided Jupyter notebook](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/notebooks/inspect_training_data.ipynb).
-
-#### 4. Train
-
-For each algorithm, build a predictive model using decision trees and feature selection based on the features' permutation importance.
-
-```bash
-./predict_train.py  # --algo medium --folder /scratch/autotuning_dataset, e.g.
-```
-
-Use the command-line parameters `--folder` and `--destination_folder` to choose the folder from which data is read, as well as the folder to which models, logs, etc. are written.
-Repeat this step for all algorithms.
-This may take several hours. For example, training algorithm 'medium' for the P100 took 11 hours on a single Greina (CSCS) node.
-Moreover, depending on the size of the training data, large amounts of memory may be needed. For example, training algorithm 'medium' for the P100 was run on a 192 GB node.
-
-#### 5. Generate optimal parameters
-
-Given predictive models (in the form of serialized [scikit-learn](https://scikit-learn.org/) model objects) for all unseen (m,n,k)s, generate or update a file of optimal parameters
-
-```bash
-./predict_genpars.py  -c 5000 \  # chunk size
-    -j 12 \ # 12 threads
-    --largeDB2 /scratch/largeDB2/feature_tree_refit.p \ # path to models
-    --largeDB1 /scratch/largeDB1/feature_tree_refit.p \
-    --medium /scratch/medium/feature_tree_refit.p \
-    --small /scratch/small/feature_tree_refit.p \
-    --tiny /scratch/tiny/feature_tree_refit.p
-```
-
-This may take several hours. For example, generating parameters for the P100 took 8 hours on a single Piz Daint (CSCS) node. For this reason, intermediate results are stored in JSON files in a folder `predict_genpars_ckpt`. Once this script has finished running, and you've successfully obtained a new `parameters_GPU.json` file, you may delete the checkpoint folder `predict_genpars_ckpt`.
-
-#### 6. Evaluate the predicted parameters
-
-```bash
-./predict_evaluate.py -f libsmm_acc_predicted.out -n libsmm_acc_baseline.out
-```
-
-#### 7. Contribute your new parameters and data
-
-##### Contribute training data
-
-See [instructions](https://github.com/cp2k/dbcsr-data#contributing) in our [dedicated repository](https://github.com/cp2k/dbcsr-data)
-
-##### Contribute predicted parameters
-
-Submit a pull request updating the `parameters_GPU.json` file in question.
-
----
-
-### Contributing to the training procedure
-
-#### Adding a new predictive feature
-
-- Choose the new feature's name, "`NAME`"
-- Add the feature as a method of `class PredictiveParameters`, named `get_NAME`
-- Add the derived feature to the data structure `derived_parameters` in [`kernels/smm_acc_predict.py`](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/kernels/smm_acc_predict.py)
diff --git a/src/acc/libsmm_acc/predict/predict_collect.py b/src/acc/libsmm_acc/predict/predict_collect.py
deleted file mode 100755
index ab41ebe1de2..00000000000
--- a/src/acc/libsmm_acc/predict/predict_collect.py
+++ /dev/null
@@ -1,268 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-####################################################################################################
-# Copyright (C) by the DBCSR developers group - All rights reserved                                #
-# This file is part of the DBCSR library.                                                          #
-#                                                                                                  #
-# For information on the license, see the LICENSE file.                                            #
-# For further information please visit https://dbcsr.cp2k.org                                      #
-# SPDX-License-Identifier: GPL-2.0+                                                                #
-####################################################################################################
-
-import sys
-import os
-import re
-import glob
-import argparse
-import pandas as pd
-
-sys.path.append("../")
-
-from kernels.smm_acc import to_string, kernel_algorithm, parameter_types  # noqa: E402
-
-
-# ===============================================================================
-def main(tunedir):
-    """
-    Once autotuning of new kernels has been run,
-    - collect the parameter information and performance from log files,
-    - dump them to CSV files for data analysis and training of a predictive model
-    """
-    # ===============================================================================
-    # Check for old data files first
-    for algorithm in kernel_algorithm.keys():
-        training_data_file = os.path.join(tunedir, f"raw_training_data_{algorithm}.csv")
-        if os.path.exists(training_data_file):
-            print(
-                f"WARNING: Found old data file {training_data_file}, re(move) it first ... exiting"
-            )
-            sys.exit(1)
-
-    # Find all the 'tune_MxNxK' folders
-    kernel_folder_pattern = re.compile(r"tune_(\d+)x(\d+)x(\d+)$")
-    kernel_folders = [
-        os.path.join(tunedir, ak)
-        for ak in os.listdir(tunedir)
-        if kernel_folder_pattern.match(ak) is not None
-    ]
-    n_kernels = len(kernel_folders)
-    assert n_kernels > 0, (
-        "Found no kernel folders of format"
-        + str(kernel_folder_pattern)
-        + " in folder "
-        + tunedir
-    )
-    print(f"Found {n_kernels} kernel folders")
-
-    # Collect information and write to csv
-    collect_training_data(kernel_folders, kernel_folder_pattern)
-
-    # Print commands to merge CSVs into one big CSV for training data
-    merge_data_files(tunedir)
-
-
-# ===============================================================================
-# Helper variables and functions (formatting & writing)
-autotuning_line = re.compile(
-    r"OK Kernel_dnt_(\w+) m (\d+)\s+n (\d+)\s+k (\d+)\s+"
-    + r"(?:tile_m (\d+)\s+tile_n (\d+)\s+(?:w (\d+)\s+v (\d+)\s+)?)?"
-    + r"threads (\d+)\s+grouping (\d+)\s+minblocks (\d+)\s+GFlop/s (\d+(?:\.\d+)?)"
-)
-
-
-def read_log_file(log_folder, m, n, k):
-    """
-    Given a folder of kernel autotuning, read and parse the autotuning information in the log file
-    and return it in the form of a pandas Dataframe.
-    :param log_folder: folder of kernel autotuning
-    :return: pandas Dataframe containing autotuning information
-    """
-    # Find log files in the log folder
-    log_files = [f for f in os.listdir(log_folder) if f[-4:] == ".log"]
-    assert len(log_files) > 0
-    log_files = sorted(log_files)
-
-    # Parse the log files and collect data
-    data = list()
-    for log_file in log_files:
-        print(f"Processing log file {log_file}")
-        with open(os.path.join(log_folder, log_file), "r") as f:
-            log_file_content = f.read().splitlines()
-
-        for line in log_file_content:
-            if "OK" in line:  # this line contains autotuning data
-                # Parse the line
-                match = autotuning_line.match(line)
-                assert match is not None, "Found null match: " + line
-
-                # Get algorithm, parameters, and performance
-                data.append(
-                    {
-                        "m": m,
-                        "n": n,
-                        "k": k,
-                        "algorithm": match.group(1),
-                        "threads": match.group(9),
-                        "grouping": match.group(10),
-                        "minblocks": match.group(11),
-                        "tile_m": (
-                            match.group(5) if match.group(5) is not None else None
-                        ),
-                        "tile_n": (
-                            match.group(6) if match.group(6) is not None else None
-                        ),
-                        "w": match.group(7) if match.group(7) is not None else None,
-                        "v": match.group(8) if match.group(8) is not None else None,
-                        "perf (Gflop/s)": match.group(12),
-                    }
-                )
-
-    print(f"{len(data)} autotuning lines found")
-
-    # Merge dictionaries into a pandas dataframe
-    dataframe = pd.DataFrame(data)
-    for col in dataframe.columns:
-        dataframe[col] = dataframe[col].astype(parameter_types[col], errors="ignore")
-
-    return dataframe
-
-
-def collect_training_data(kernel_folders, kernel_folder_pattern):
-    """
-    Collect training data from log files resulting of autotuning
-    """
-
-    # ===============================================================================
-    # For each folder:
-    n_kernels = len(kernel_folders)
-    for i, kernel_folder in enumerate(kernel_folders):
-        print(f"\nProcess folder {kernel_folder} ({i+1}/{n_kernels})")
-
-        # Find (m, n, k)
-        # Each folder contains data for just one (m, n, k) but potentially mutliple algorithms
-        match = kernel_folder_pattern.search(kernel_folder).groups()
-        m = int(match[0])
-        n = int(match[1])
-        k = int(match[2])
-
-        # ===============================================================================
-        # Collect info from log files
-        log_files = [f for f in os.listdir(kernel_folder) if f[-4:] == ".log"]
-        if len(log_files) > 0:
-            data = read_log_file(kernel_folder, m, n, k)
-        else:
-            print(f"No log files found in folder {kernel_folder} ... skipping")
-            continue
-
-        # ===============================================================================
-        # Write parameters to CSV
-        for name_algo, kernel_algo in kernel_algorithm.items():
-            # if applicable to this mnk
-            if name_algo in data["algorithm"].values:
-                # Does collected csv file exist already?
-                raw_parameters_file_name = os.path.join(
-                    kernel_folder,
-                    "raw_training_data_"
-                    + to_string(m, n, k)
-                    + "_"
-                    + name_algo
-                    + ".csv",
-                )
-
-                if os.path.exists(raw_parameters_file_name):
-                    print(f"Found csv file {raw_parameters_file_name} ... skipping")
-                else:
-                    # Get the data corresponding to this algorithm
-                    data_algo = data[data["algorithm"] == name_algo]
-                    # Write raw parameters
-                    pars_to_get = kernel_algo.launch_parameters + ["perf (Gflop/s)"]
-                    data_algo[pars_to_get].to_csv(raw_parameters_file_name, index=False)
-                    print("Wrote", raw_parameters_file_name)
-
-
-# ===============================================================================
-def merge_data_files(tunedir):
-    """
-    Merge CSV files
-    """
-    for algorithm in kernel_algorithm.keys():
-        training_data_file = os.path.join(
-            tunedir, "raw_training_data_{algorithm}.csv".format(algorithm=algorithm)
-        )
-
-        if os.path.exists(training_data_file):
-            print(f"\nFound {training_data_file} ... skipping")
-            os.rename(training_data_file, f"{training_data_file}.bak")
-
-        print(f"\nMerging partial CSV files into {training_data_file} ... ")
-
-        filenames_pattern = os.path.join(
-            tunedir,
-            "tune_*/raw_training_data_*_{algorithm}.csv".format(algorithm=algorithm),
-        )
-        print("Merging all files with pattern:", filenames_pattern)
-        filenames = glob.glob(filenames_pattern)
-        if len(filenames) == 0:
-            print("Found no files matching this pattern ... skipping")
-
-        else:
-            print(f"Found {len(filenames)} files matching this pattern")
-
-            with open(training_data_file, "w") as out:
-                # Write the first file, including its header
-                fn_1 = filenames.pop(0)
-                with open(fn_1) as f:
-                    header_line_ref = next(f)  # read header line
-                    out.write(header_line_ref)  # write header line
-                    out.write(f.read())  # write the rest of the file
-                # Write the rest of the files, skipping the header line each time
-                for i, fn in enumerate(filenames):
-                    print("writing from {} ({}/{})".format(fn, i + 1, len(filenames)))
-                    with open(fn) as f:
-                        header_line = next(f)  # skip header line
-                        assert header_line == header_line_ref, (
-                            'Cannot merge file "'
-                            + fn
-                            + '", because its header line:\n'
-                            + header_line
-                            + 'is different from the header line of file "'
-                            + fn_1
-                            + '":\n'
-                            + header_line_ref
-                        )
-                        out.write(f.read())
-
-            print("Wrote to {}".format(training_data_file))
-
-
-# ===============================================================================
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="""
-        Collect matrix-matrix multiplication parameters and performances measured during autotuning. For that,
-        parse the log files created by the autotuning and record parameter sets and their performances to CSV files.
-
-        This script is part of the workflow for predictive modelling of optimal libsmm_acc parameters.
-        For more details, see README.md.
-        """,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "-f",
-        "--folder",
-        metavar="FOLDER",
-        type=str,
-        default=".",
-        help="Folder in which the folders tune_*x*x*x/ are to be found",
-    )
-    parser.add_argument(
-        "-a",
-        "--arch",
-        metavar="ARCHITECTURE_NUMBER",
-        type=int,
-        default=80,
-        help="GPU architecture code. Options: sm_35, sm_37, sm_60, sm_70, sm_80, gfx906",
-    )
-
-    args = parser.parse_args()
-    main(args.folder)
diff --git a/src/acc/libsmm_acc/predict/predict_evaluate.py b/src/acc/libsmm_acc/predict/predict_evaluate.py
deleted file mode 100755
index a5b3de7f4af..00000000000
--- a/src/acc/libsmm_acc/predict/predict_evaluate.py
+++ /dev/null
@@ -1,174 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-####################################################################################################
-# Copyright (C) by the DBCSR developers group - All rights reserved                                #
-# This file is part of the DBCSR library.                                                          #
-#                                                                                                  #
-# For information on the license, see the LICENSE file.                                            #
-# For further information please visit https://dbcsr.cp2k.org                                      #
-# SPDX-License-Identifier: GPL-2.0+                                                                #
-####################################################################################################
-
-import re
-import numpy as np
-import argparse
-from predict_helpers import (
-    performance_gain,
-    relative_performance_gain,
-    plot_absolute_performance_gain,
-    plot_relative_performance_gain,
-    plot_performance_gains,
-)
-
-
-# ===============================================================================
-def main(file, file_baseline):
-    """
-    Given a file containing the results of the LBSMM_ACC performance test, perform evaluation of the predictive model.
-    """
-    # ===============================================================================
-    # Read optimal-parameter-prediction result file
-    with open(file) as f:
-        result_file = f.read().splitlines()
-    results_predictive_model = read_result_file(result_file)
-
-    # Read baseline result file
-    with open(file_baseline) as f:
-        result_file = f.read().splitlines()
-    results_baseline = read_result_file(result_file)
-
-    # ===============================================================================
-    # Performance comparison quantities
-    improved_over_baseline = dict(
-        zip(
-            sorted(results_predictive_model.keys()),
-            [
-                results_predictive_model[(m, n, k)] > results_baseline[(m, n, k)]
-                for m, n, k in sorted(results_predictive_model.keys())
-            ],
-        )
-    )
-    perf_gain_over_baseline = performance_gain(
-        results_baseline, results_predictive_model
-    )
-    rel_perf_gain_over_baseline = relative_performance_gain(
-        results_baseline, results_predictive_model
-    )
-
-    # ===============================================================================
-    # Print results
-    header = "m, n, k: baseline perf. [Gflops], predictive model perf. [Gflops], performance gain [? ]"
-    print(header)
-    line = (
-        "{m:>2}, {n:>2}, {k:>2}: {baseline_perf:>7.2f}, {predictive_model_perf:>7.2f}, "
-        + "{performance_gain:>7.2f}, {better}"
-    )
-    for m, n, k in sorted(results_predictive_model.keys()):
-        print(
-            line.format(
-                m=m,
-                n=n,
-                k=k,
-                baseline_perf=results_baseline[(m, n, k)],
-                predictive_model_perf=results_predictive_model[(m, n, k)],
-                performance_gain=perf_gain_over_baseline[(m, n, k)],
-                better=improved_over_baseline[(m, n, k)],
-            )
-        )
-
-    print(
-        "\nKernel performances improved by predictive model:",
-        list(improved_over_baseline.values()).count(True),
-        "/",
-        len(results_predictive_model.keys()),
-    )
-    perf_gain_improved = [pg for pg in perf_gain_over_baseline.values() if pg > 0]
-    print(
-        "Mean performance gain amongst improved kernels: {:.2f} Gflops".format(
-            np.mean(perf_gain_improved)
-        )
-    )
-
-    print(
-        "\nKernel performances reduced by predictive model:",
-        list(improved_over_baseline.values()).count(False),
-        "/",
-        len(results_predictive_model.keys()),
-    )
-    perf_gain_deteriorated = [pg for pg in perf_gain_over_baseline.values() if pg < 0]
-    print(
-        "Mean performance loss amongst deteriorated kernels: {:.2f} Gflops".format(
-            np.mean(perf_gain_deteriorated)
-        )
-    )
-
-    print(
-        "\nMean performance gain overall: {:.2f} Gflops".format(
-            np.mean(list(perf_gain_over_baseline.values()))
-        )
-    )
-
-    # ===============================================================================
-    # Plot results (testing set: predictive modelling VS naïve)
-    plot_absolute_performance_gain(
-        perf_gain_over_baseline, "non-autotuned", "baseline", "predictive model"
-    )
-    plot_relative_performance_gain(
-        rel_perf_gain_over_baseline, "non-autotuned", "baseline", "predictive model"
-    )
-    plot_performance_gains(
-        results_predictive_model,
-        results_baseline,
-        "non-autotuned",
-        "baseline",
-        "predictive model",
-    )
-
-
-# ===============================================================================
-def read_result_file(file):
-    results = dict()
-    result_line = re.compile(r"OK (\d+) x (\d+) x (\d+) GFlop/s (\d+(?:\.\d+)?)")
-    for line in file:
-        match = result_line.match(line)
-        if match is not None:
-            m = int(match.group(1))
-            n = int(match.group(2))
-            k = int(match.group(3))
-            perf = float(match.group(4))
-            results[(m, n, k)] = perf
-
-    return results
-
-
-# ===============================================================================
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="""
-        Given a file containing the results of the LIBSMM_ACC performance test, perform evaluation of the predictive
-        model.
-
-        This script is part of the workflow for predictive modelling of optimal libsmm_acc parameters.
-        For more details, see README.md.
-        """,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "-f",
-        "--file",
-        metavar="filename.out",
-        type=str,
-        default="",
-        help="Result file to evaluate. Output of tests/libsmm_acc_timer_multiply.cpp",
-    )
-    parser.add_argument(
-        "-n",
-        "--file_baseline",
-        metavar="filename.out",
-        type=str,
-        default="",
-        help="Baseline performance file to compare against.",
-    )
-
-    args = parser.parse_args()
-    main(args.file, args.file_baseline)
diff --git a/src/acc/libsmm_acc/predict/predict_genpars.py b/src/acc/libsmm_acc/predict/predict_genpars.py
deleted file mode 100755
index 61f377053ce..00000000000
--- a/src/acc/libsmm_acc/predict/predict_genpars.py
+++ /dev/null
@@ -1,406 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-####################################################################################################
-# Copyright (C) by the DBCSR developers group - All rights reserved                                #
-# This file is part of the DBCSR library.                                                          #
-#                                                                                                  #
-# For information on the license, see the LICENSE file.                                            #
-# For further information please visit https://dbcsr.cp2k.org                                      #
-# SPDX-License-Identifier: GPL-2.0+                                                                #
-####################################################################################################
-
-import gc
-import os
-import sys
-import json
-import pandas as pd
-from itertools import product
-import argparse
-from joblib import Parallel, delayed
-from predict_helpers import safe_pickle_load
-from warnings import simplefilter
-
-simplefilter(action="ignore", category=UserWarning)
-
-sys.path.append("../")
-from kernels.smm_acc import to_tuple, to_string  # noqa: E402
-from kernels.smm_acc_predict import (  # noqa: E402
-    gpu_architectures,
-    kernel_algorithm,
-    params_dict_to_kernel,
-    PredictiveParameters,
-)
-
-# The joblib backend spawns additional processes, which do not inherit the warning filters applied using warnings.filterwarnings
-os.environ["PYTHONWARNINGS"] = "ignore::UserWarning"
-
-
-# ===============================================================================
-def main(params, njobs, baseline, paths_to_models, chunk_size):
-    """
-    Update parameter file with new optimal parameter predictions given newly trained decision trees
-    """
-    # ===============================================================================
-    # Load GPU and autotuning properties
-    assert (
-        os.path.basename(params) in gpu_architectures.keys()
-    ), "Cannot find compute version for file " + str(params)
-    arch_code = gpu_architectures[os.path.basename(params)]
-    with open("../kernels/gpu_properties.json") as f:
-        gpu_properties = json.load(f)[arch_code]
-    with open("../kernels/autotuning_properties.json") as f:
-        autotuning_properties = json.load(f)
-
-    # Load autotuned kernel parameters
-    with open(params) as f:
-        all_kernels = [params_dict_to_kernel(**params) for params in json.load(f)]
-    print("libsmm_acc: Found %d existing parameter sets." % len(all_kernels))
-    autotuned_mnks = [(k.m, k.n, k.k) for k in all_kernels if k.autotuned]
-    autotuned_kernels_ = [k for k in all_kernels if k.autotuned]
-    autotuned_kernels = dict(zip(autotuned_mnks, autotuned_kernels_))
-
-    # ===============================================================================
-    # Construct the list of (m,n,k)-triplets for which parameter sets should be made available to libcusmm
-    mnks = combinations(list(range(4, 46)))
-    mnks = set.union(set(mnks), set(autotuned_kernels.keys()))
-
-    # ===============================================================================
-    # Compute parameter sets
-    mnks_to_predict = list()
-    kernels_to_print = dict()
-    for m, n, k in mnks:
-        if (m, n, k) in autotuned_kernels.keys():
-            kernels_to_print[(m, n, k)] = autotuned_kernels[(m, n, k)]
-        else:
-            mnks_to_predict.append((m, n, k))
-
-    if baseline:
-        kernels = get_baseline_kernels(
-            mnks_to_predict, gpu_properties, autotuning_properties
-        )
-    else:
-        kernels = get_optimal_kernels(
-            mnks_to_predict,
-            njobs,
-            chunk_size,
-            paths_to_models,
-            gpu_properties,
-            autotuning_properties,
-            1,
-        )
-
-    kernels_to_print.update(kernels)
-
-    # ===============================================================================
-    # Write to file
-    with open(params, "w") as f:
-        s = json.dumps(
-            [
-                kernels_to_print[kernel].as_dict_for_parameters_json
-                for kernel in sorted(kernels_to_print.keys())
-            ]
-        )
-        s = s.replace("}, ", "},\n")
-        s = s.replace("[", "[\n")
-        s = s.replace("]", "\n]")
-        f.write(s)
-    print("Wrote new predicted parameters to file", params)
-
-
-# ===============================================================================
-# Helpers
-def combinations(sizes):
-    return list(product(sizes, sizes, sizes))
-
-
-def remove_empty_entries(ld):
-    """
-    Given a list of dictionaries "ld", remove its list elements that are empty dicts
-    """
-    return [d for d in ld if d]  # empty dictionaries evaluate to False
-
-
-def find_optimal_kernel(
-    mnk, algo, tree, tree_features, gpu_properties, autotuning_properties
-):
-    """
-    Find the optimal kernel parameter set for a given (m, n, k) and a given algorithm
-    :return: optimal_kernels: dictionary, keys: (m, n, k), values: Kernel object describing best parameters
-    """
-
-    # Get parameter space for this (m, n, k) and this algorithm
-    m, n, k = mnk
-    parameter_space_ = kernel_algorithm[algo].promising_parameters(
-        m, n, k, gpu_properties, autotuning_properties
-    )
-    parameter_space = pd.DataFrame(parameter_space_)
-    del parameter_space_
-    parameter_space["algorithm"] = [algo] * len(
-        parameter_space.index
-    )  # Add "algorithm" column
-    if len(parameter_space.index) == 0:
-        optimal_kernels = dict()
-
-    else:
-        # Get predictor features from raw parameters
-        parameter_sets = PredictiveParameters(
-            parameter_space, gpu_properties, autotuning_properties, None
-        )
-        predictors = parameter_sets.get_features(tree_features)
-        if algo == "medium":
-            predictors = predictors.rename(
-                columns=dict(
-                    zip(
-                        predictors.columns,
-                        [
-                            "f{}".format(i)
-                            for i in range(0, len(predictors.columns) + 1)
-                        ],
-                    )
-                )
-            )
-
-        # Predict performances
-        performances_scaled = tree.predict(predictors)
-        del predictors
-        parameter_performances = parameter_sets.params
-        del parameter_sets
-        parameter_performances["perf"] = performances_scaled
-        del performances_scaled
-
-        # Pick optimal kernel
-        optimal_kernel = max(
-            parameter_performances.to_dict("records"), key=lambda x: x["perf"]
-        )
-        del parameter_performances
-        optimal_kernels = dict()
-        optimal_kernels[(m, n, k)] = params_dict_to_kernel(
-            **optimal_kernel, source="predicted"
-        )
-
-    return optimal_kernels
-
-
-def get_optimal_kernels(
-    mnks_to_predict,
-    njobs,
-    chunk_size,
-    paths_to_models,
-    gpu_properties,
-    autotuning_properties,
-    top_k,
-):
-    # optimal_kernels_list is a list of dictionaries
-    # - keys: (m, n, k),
-    # - values: Kernel object describing best parameters
-    # - number of elements in each dictionary = top_k
-    # each element of the list corresponds to the search of optimal kernels for a given mnk and a given algorithm
-
-    print("Getting optimal kernels")
-
-    # ===============================================================================
-    # Load predictive trees and feature list
-    tree = dict()
-    kernel_to_investigate = dict()
-    for algo in kernel_algorithm.keys():
-        path_to_model = paths_to_models[algo]
-        if path_to_model is not None:
-            print(
-                "Algorithm: {:<8}, loading model from: {}".format(algo, path_to_model)
-            )
-            tree[algo] = dict()
-            tree[algo]["file"] = path_to_model
-            features, tree[algo]["tree"] = safe_pickle_load(tree[algo]["file"])
-            tree[algo]["features"] = features.tolist()
-            kernel_to_investigate[algo] = kernel_algorithm[algo]
-        else:
-            print("Algorithm: {:<8}, no model found.".format(algo))
-
-    if len(kernel_to_investigate) == 0:
-        print("No model found. Specify path to predictive models using ")
-        sys.exit(1)
-
-    # ===============================================================================
-    # Get mnks_by_algo to compute:
-    mnks_by_algo = list(product(mnks_to_predict, kernel_to_investigate.keys()))
-    num_mnks_by_algo = len(mnks_by_algo)
-    optimal_kernels_list = list()
-    ckpt_folder_name = "predict_genpars_ckpt"
-
-    if not os.path.exists(ckpt_folder_name):
-        os.mkdir(ckpt_folder_name)
-    print("Caching intermediate results to:", ckpt_folder_name)
-
-    for i in range(0, num_mnks_by_algo, chunk_size):
-        # Chunk up tasks
-        start_chunk = i
-        end_chunk = int(min(start_chunk + chunk_size, num_mnks_by_algo))
-        print(f"Completed {i} tasks out of {num_mnks_by_algo}")
-
-        # Create checkpoint file or load checkpointed data from it
-        checkpoint_file_name = os.path.join(
-            ckpt_folder_name, f"chunk_{start_chunk}-{end_chunk - 1}.json"
-        )
-
-        if os.path.exists(checkpoint_file_name):
-            with open(checkpoint_file_name, "r") as f:
-                optimal_kernels_list__ = json.load(f)
-                optimal_kernels_list_ = list()
-                for i, optker in enumerate(optimal_kernels_list__):
-                    optimal_kernels_list_.append({})
-                    for k, v in optker.items():
-                        algo = v.pop("algorithm")
-                        optimal_kernels_list_[i][to_tuple(k)] = kernel_algorithm[algo](
-                            **v
-                        )
-            print(f"Read chunk {start_chunk}-{end_chunk - 1}\n")
-
-        else:
-            if njobs == 1:
-                j = i
-                optimal_kernels_list_ = list()
-                # Ignore joblib and run serially:
-                for mnk, algo in mnks_by_algo[start_chunk:end_chunk]:
-                    j += 1
-                    gc.collect()
-                    print(
-                        f"{j:6d} of {num_mnks_by_algo}: Find optimal kernels for mnk = {mnk} algo = {algo}"
-                    )
-                    optker = find_optimal_kernel(
-                        mnk,
-                        algo,
-                        tree[algo]["tree"],
-                        tree[algo]["features"],
-                        gpu_properties,
-                        autotuning_properties,
-                    )
-                    if optker:
-                        optimal_kernels_list_.append(optker)
-
-            else:
-                # Run prediction tasks in parallel with joblib
-                optimal_kernels_list_ = Parallel(n_jobs=njobs, verbose=2)(
-                    delayed(find_optimal_kernel, check_pickle=True)(
-                        mnk,
-                        algo,
-                        tree[algo]["tree"],
-                        tree[algo]["features"],
-                        gpu_properties,
-                        autotuning_properties,
-                    )
-                    for mnk, algo in mnks_by_algo[start_chunk:end_chunk]
-                )
-                optimal_kernels_list_ = remove_empty_entries(optimal_kernels_list_)
-
-            with open(checkpoint_file_name, "w") as f:
-                optimal_kernels_list__ = list()
-                for i, optker in enumerate(optimal_kernels_list_):
-                    optimal_kernels_list__.append({})
-                    for k, v in optker.items():
-                        optimal_kernels_list__[i][to_string(k)] = v.as_dict
-                json.dump(optimal_kernels_list__, f)
-                print(f"Checkpoint file {checkpoint_file_name} written")
-
-        optimal_kernels_list += optimal_kernels_list_
-
-    print("Finished gathering candidates for optimal parameter space")
-
-    # Group optimal kernel candidates by (m,n,k) in a dictionary
-    optimal_kernels_mnk_algo = dict()
-    for optimal_kernel_mnk in optimal_kernels_list:
-        for mnk, kernels_mnk in optimal_kernel_mnk.items():
-            m, n, k = mnk
-            if (m, n, k) in optimal_kernels_mnk_algo.keys():
-                optimal_kernels_mnk_algo[(m, n, k)].append(kernels_mnk)
-            else:
-                optimal_kernels_mnk_algo[(m, n, k)] = [kernels_mnk]
-
-    # Find optimal kernel per mnk among the different algorithm possibilities
-    optimal_kernels = dict()
-    for mnk, candidate_kernels in optimal_kernels_mnk_algo.items():
-        m, n, k = mnk
-        optimal_kernel_mnk = sorted(
-            candidate_kernels, key=lambda x: x.perf, reverse=True
-        )[:top_k]
-        optimal_kernels[(m, n, k)] = optimal_kernel_mnk[0]
-
-    return optimal_kernels
-
-
-def get_baseline_kernels(mnks_to_predict, gpu_propertes, autotuning_properties):
-    print("Getting baseline kernels")
-    baseline_algorithm = "medium"
-    baseline_kernels = list()
-    for m, n, k in mnks_to_predict:
-        baseline_kernels[(m, n, k)] = kernel_algorithm[baseline_algorithm].baseline(
-            m, n, k, gpu_propertes, autotuning_properties
-        )
-
-    return baseline_kernels
-
-
-# ===============================================================================
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="""
-        Update parameter file with new optimal parameter predictions given newly trained decision trees.
-
-        This script is part of the workflow for predictive modelling of optimal libsmm_acc parameters.
-        For more details, see README.md.
-        """,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-
-    parser.add_argument(
-        "-p",
-        "--params",
-        metavar="parameters_GPU.json",
-        default="../parameters/parameters_A100.json",
-        help="Parameter file to read and update with predictions",
-    )
-    parser.add_argument(
-        "-j", "--njobs", type=int, default=-1, help="Number of joblib jobs"
-    )
-    parser.add_argument(
-        "--baseline",
-        default=False,
-        help="Generate a parameter file corresponding to the baseline of a predictive model",
-    )
-    parser.add_argument(
-        "--tiny",
-        default=None,
-        help="Path to model trained for algorithm 'tiny'. If not given, ignore this algorithm.",
-    )
-    parser.add_argument(
-        "--small",
-        default=None,
-        help="Path to model trained for algorithm 'small'. If not given, ignore this algorithm.",
-    )
-    parser.add_argument(
-        "--medium",
-        default=None,
-        help="Path to model trained for algorithm 'medium'. If not given, ignore this algorithm.",
-    )
-    parser.add_argument(
-        "--largeDB1",
-        default=None,
-        help="Path to model trained for algorithm 'largeDB1'. If not given, ignore this algorithm.",
-    )
-    parser.add_argument(
-        "--largeDB2",
-        default=None,
-        help="Path to model trained for algorithm 'largeDB2'. If not given, ignore this algorithm.",
-    )
-    parser.add_argument(
-        "-c",
-        "--chunk_size",
-        type=int,
-        default=5000,
-        help="Chunk size for dispatching joblib jobs. If memory errors are experienced, reduce this number",
-    )
-
-    args = parser.parse_args()
-    paths_to_models = dict()
-    for algo in kernel_algorithm.keys():
-        paths_to_models[algo] = args.__dict__[algo]
-    main(args.params, args.njobs, args.baseline, paths_to_models, args.chunk_size)
diff --git a/src/acc/libsmm_acc/predict/predict_helpers.py b/src/acc/libsmm_acc/predict/predict_helpers.py
deleted file mode 100644
index 890d793f003..00000000000
--- a/src/acc/libsmm_acc/predict/predict_helpers.py
+++ /dev/null
@@ -1,301 +0,0 @@
-# -*- coding: utf-8 -*-
-####################################################################################################
-# Copyright (C) by the DBCSR developers group - All rights reserved                                #
-# This file is part of the DBCSR library.                                                          #
-#                                                                                                  #
-# For information on the license, see the LICENSE file.                                            #
-# For further information please visit https://dbcsr.cp2k.org                                      #
-# SPDX-License-Identifier: GPL-2.0+                                                                #
-####################################################################################################
-
-import sys
-import os
-import pickle
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-
-sys.path.append("../")
-from kernels.smm_acc import to_string  # noqa: E402
-
-
-# ===============================================================================
-# I/O helpers
-def safe_pickle(data, file):
-    """
-    Pickle big files safely by processing them in chunks.
-    This wrapper is a workaround for a bug on OSX (https://bugs.python.org/issue24658)
-
-    :param data: data to be pickled
-    :param file: file to pickle it into
-    """
-    max_bytes = 2**31 - 1  # Maximum number of bytes to write in one chunk
-    pickle_out = pickle.dumps(data)
-    n_bytes = len(pickle_out)
-    with open(file, "wb") as f:
-        count = 0
-        for i in range(0, n_bytes, max_bytes):
-            f.write(pickle_out[i : min(n_bytes, i + max_bytes)])
-            count += 1
-
-
-def safe_pickle_load(file_path):
-    """
-    Load big pickled files safely by processing them in chunks
-    This wrapper is a workaround a bug on OSX (https://bugs.python.org/issue24658)
-
-    :param data: data to be loaded through pickle
-    :param file: file to read from
-    """
-    max_bytes = 2**31 - 1  # Maximum number of bytes to read in one chunk
-    bytes_in = bytearray(0)
-    input_size = os.path.getsize(file_path)
-    with open(file_path, "rb") as f:
-        for _ in range(0, input_size, max_bytes):
-            bytes_in += f.read(max_bytes)
-    return pickle.loads(bytes_in)
-
-
-# ===============================================================================
-# Model evaluation helpers
-def performance_gain(baseline, current):
-    """
-    Compute the absolute perfomance gain, in Gflop/s between a baseline and a 'current'
-    :param baseline, current: dictionary, keys: (m, n, k), values: performance in Gflop/s
-    :return: dictionary, keys: (m, n, k), values: performance difference in Gflop/s
-    """
-    return dict(
-        zip(
-            sorted(current.keys()),
-            [
-                current[(m, n, k)] - baseline[(m, n, k)]
-                for m, n, k in sorted(current.keys())
-            ],
-        )
-    )
-
-
-def relative_performance_gain(baseline, current):
-    """
-    Compute the relative perfomance gain (no units), between a baseline and a 'current'
-    :param baseline, current: dictionary, keys: (m, n, k), values: performance in Gflop/s
-    :return: dictionary, keys: (m, n, k), values: relative performance difference (no units)
-    """
-    return dict(
-        zip(
-            sorted(current.keys()),
-            [
-                (current[(m, n, k)] - baseline[(m, n, k)]) / baseline[(m, n, k)]
-                for m, n, k in sorted(current.keys())
-            ],
-        )
-    )
-
-
-def plot_absolute_performance_gain(
-    perf_gain, mnk_names, baseline_name, current_name, pp=None
-):
-    mnk_products = [
-        m * n * k
-        for m, n, k in sorted(perf_gain.keys(), key=lambda x: x[0] * x[1] * x[2])
-    ]
-
-    plt.figure()
-    plt.plot(mnk_products, list(perf_gain.values()), ".", markersize=3)
-    plt.plot([mnk_products[0], mnk_products[-1]], [0, 0], "-r")
-    plt.xlabel(mnk_names + " (m, n, k) triplets (in order of increasing m*n*k)")
-    plt.ylabel("Performance Gain [Gflops]")
-    plt.title(
-        "Performance gain of "
-        + current_name
-        + " VS "
-        + baseline_name
-        + " parameter set"
-    )
-    if pp is not None:
-        pp.savefig()
-    else:
-        plt.show()
-    plt.close()
-
-
-def plot_relative_performance_gain(
-    rel_perf_gain, mnk_names, baseline_name, current_name, pp=None
-):
-    mnk_products = [
-        m * n * k
-        for m, n, k in sorted(rel_perf_gain.keys(), key=lambda x: x[0] * x[1] * x[2])
-    ]
-
-    plt.figure()
-    plt.plot(
-        mnk_products, 100 * np.array(list(rel_perf_gain.values())), ".", markersize=3
-    )
-    plt.plot([mnk_products[0], mnk_products[-1]], [0, 0], "-r")
-    plt.xlabel(mnk_names + " (m, n, k) triplets (in order of increasing m*n*k)")
-    plt.ylabel("Performance Gain [%]")
-    plt.title(
-        "Relative performance gain of "
-        + current_name
-        + " VS "
-        + baseline_name
-        + " parameter set"
-    )
-    if pp is not None:
-        pp.savefig()
-    else:
-        plt.show()
-    plt.close()
-
-
-def plot_performance_gains(
-    perf_gain1, perf_gain2, mnk_names, perf_gain1_name, perf_gain2_name, pp=None
-):
-    mnks = [
-        (m, n, k)
-        for m, n, k in sorted(perf_gain2.keys(), key=lambda x: x[0] * x[1] * x[2])
-    ]
-    mnk_products = [
-        m * n * k
-        for m, n, k in sorted(perf_gain2.keys(), key=lambda x: x[0] * x[1] * x[2])
-    ]
-    res1 = [perf_gain1[mnk] for mnk in mnks]
-    res2 = [perf_gain2[mnk] for mnk in mnks]
-
-    marker_size = 3
-    plt.figure()
-    plt.plot(mnk_products, res1, ".", markersize=marker_size)
-    plt.plot(mnk_products, res2, ".", color="#d62728", markersize=marker_size)
-    plt.xlabel(mnk_names + " (m, n, k) triplets (in order of increasing m*n*k)")
-    plt.ylabel("Performance [Gflops]")
-    plt.xscale("log")
-    plt.legend([perf_gain1_name, perf_gain2_name])
-    plt.title(
-        "Performance of "
-        + perf_gain1_name
-        + " and "
-        + perf_gain2_name
-        + " parameter set"
-    )
-    if pp is not None:
-        pp.savefig()
-    else:
-        plt.show()
-    plt.close()
-
-
-def plot_scaled_performance_gains(
-    perf_gain1, perf_gain2, mnk_names, perf_gain1_name, perf_gain2_name, pp=None
-):
-    mnks = [
-        (m, n, k)
-        for m, n, k in sorted(perf_gain2.keys(), key=lambda x: x[0] * x[1] * x[2])
-    ]
-    mnk_products = [
-        m * n * k
-        for m, n, k in sorted(perf_gain2.keys(), key=lambda x: x[0] * x[1] * x[2])
-    ]
-    res1 = np.array([perf_gain1[mnk] for mnk in mnks])
-    res2 = np.array([perf_gain2[mnk] for mnk in mnks])
-
-    marker_size = 3
-    plt.figure()
-    plt.plot(mnk_products, 100 * res1, ".", markersize=marker_size)
-    plt.plot(mnk_products, 100 * res2, ".", color="#d62728", markersize=marker_size)
-    plt.xlabel(mnk_names + " (m, n, k) triplets (in order of increasing m*n*k)")
-    plt.ylabel("Scaled performance [%]")
-    plt.xscale("log")
-    plt.legend([perf_gain1_name, perf_gain2_name])
-    plt.title(
-        "Performance of "
-        + perf_gain1_name
-        + " and "
-        + perf_gain2_name
-        + " parameter set"
-    )
-    if pp is not None:
-        pp.savefig()
-    else:
-        plt.show()
-    plt.close()
-
-
-def plot_choice_goodness(
-    m,
-    n,
-    k,
-    baseline_performances,
-    max_performances,
-    y_true,
-    y_pred,
-    train,
-    pp,
-    scaled=True,
-):
-    # Sort in ascending performances
-    data_mnk = pd.DataFrame()
-    if scaled:
-        data_mnk["perf_true"] = (100 * y_true).tolist()
-        data_mnk["perf_pred"] = (100 * y_pred).tolist()
-    else:
-        data_mnk["perf_true"] = y_true.flatten().tolist()
-        data_mnk["perf_pred"] = y_pred.tolist()
-    data_mnk.sort_values(by="perf_true", inplace=True)
-
-    # Plot
-    plt.figure()
-    marker_size = 1
-    par_set_ids = range(len(data_mnk.index.values))
-    plt.plot(
-        par_set_ids,
-        data_mnk["perf_true"],
-        "b.",
-        markersize=marker_size,
-        label="measured performances",
-    )
-    plt.xlabel("Parameter set id")
-    plt.ylabel("Percentage of autotuned performance achieved [%]")
-    type = "train" if train else "test"
-    plt.title(
-        "Performance profile of parameter sets for "
-        + str((m, n, k))
-        + "-triplet ("
-        + type
-        + ")"
-    )
-
-    # Annotate
-    x = [0, len(y_true)]
-    y = np.array([1, 1])
-    perf_num = "{:2.2f}"
-
-    # chosen
-    idx_perf_chosen = data_mnk["perf_pred"].idxmax()
-    perf_chosen = data_mnk["perf_true"][idx_perf_chosen]
-    plt.plot(
-        x,
-        perf_chosen * y,
-        "r-",
-        label="perf of chosen param set: " + perf_num.format(perf_chosen) + "%",
-    )
-
-    # baseline
-    if scaled:
-        # baseline = per algo, scale it to 0-1
-        perf_baseline = (
-            100
-            * baseline_performances[to_string(m, n, k)]
-            / max_performances["{}x{}x{}".format(m, n, k)]
-        )
-    else:
-        perf_baseline = baseline_performances[to_string(m, n, k)]
-    plt.plot(
-        x,
-        perf_baseline * y,
-        "g-",
-        label="perf of baseline param set: " + perf_num.format(perf_baseline) + "%",
-    )
-
-    plt.legend(loc="lower right")
-    pp.savefig()
-    plt.close()
diff --git a/src/acc/libsmm_acc/predict/predict_train.py b/src/acc/libsmm_acc/predict/predict_train.py
deleted file mode 100755
index cf2b3845202..00000000000
--- a/src/acc/libsmm_acc/predict/predict_train.py
+++ /dev/null
@@ -1,1685 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-####################################################################################################
-# Copyright (C) by the DBCSR developers group - All rights reserved                                #
-# This file is part of the DBCSR library.                                                          #
-#                                                                                                  #
-# For information on the license, see the LICENSE file.                                            #
-# For further information please visit https://dbcsr.cp2k.org                                      #
-# SPDX-License-Identifier: GPL-2.0+                                                                #
-####################################################################################################
-
-import os
-import sys
-import datetime
-import json
-import random
-import numpy as np
-import pandas as pd
-import xgboost as xgb
-import dask.dataframe as dd
-import matplotlib.pyplot as plt
-import argparse
-from predict_helpers import (
-    safe_pickle,
-    safe_pickle_load,
-    plot_choice_goodness,
-    plot_performance_gains,
-    plot_scaled_performance_gains,
-    plot_absolute_performance_gain,
-    plot_relative_performance_gain,
-    performance_gain,
-)
-
-sys.path.append("../")
-from kernels.smm_predict import to_tuple, to_string  # noqa: E402
-
-visual_separator = (
-    "\n----------------------------------------------------------------------------"
-)
-
-
-# ===============================================================================
-def main(
-    datadir,
-    destdir,
-    algo,
-    model_args,
-    nrows,
-    prefitted_model_folder,
-    run_intermediate_evaluation,
-):
-    """
-    Train a Machine Learning model on autotuning data to predict a kernel's performance given
-    its template parameters
-    """
-    # ===============================================================================
-    # Create folder to store results of this training and start a log
-    folder, log_file, log = get_log_folder(prefitted_model_folder, destdir, algo)
-
-    # ===============================================================================
-    # Override algorithm option if working on a pre-fitted model, and log program options
-    log += print_and_log(visual_separator)
-    algo, model_args, nrows, log = dump_or_load_options(
-        algo, model_args, prefitted_model_folder, nrows, folder, log
-    )
-
-    # ===============================================================================
-    # Get maximum and baseline performances
-    (
-        max_performances,
-        max_performances_algo,
-        max_performances_ref,
-        baseline_performances_algo,
-    ) = get_reference_performances(datadir, algo)
-
-    # ===============================================================================
-    # Read data
-    log += print_and_log(visual_separator)
-    X, X_mnk, Y, log, data_nrows = read_data(algo, datadir, nrows, folder, log)
-
-    # ===============================================================================
-    # AT THIS POINT, WE MOVE FROM DASK (out-of-memory dataframes) TO PANDAS
-    # ===============================================================================
-    log += print_and_log("[moving to pandas] Compute X ...")
-    X = X.compute()
-    log += print_and_log("[moving to pandas] Compute Y ...")
-    Y = Y.compute()
-    log += print_and_log("[moving to pandas] Compute X_mnk ...")
-    X_mnk = X_mnk.compute()
-    log += print_and_log("[moving to pandas] Done")
-
-    # ===============================================================================
-    # Get or train partial model (i.e. trained on the "training" part of the data, not the entire dataset)
-    log += print_and_log(visual_separator)
-    if len(prefitted_model_folder) == 0:  # train a model
-        log += print_and_log("\nPreparing to fit model...")
-        (
-            X_train,
-            Y_train,
-            X_mnk_train,
-            X_test,
-            Y_test,
-            X_mnk_test,
-            model_partial,
-            log,
-        ) = train_model(X, X_mnk, Y, algo, model_args, folder, log)
-
-    else:  # load pre-trained model
-        log += print_and_log(
-            "\nReading partial pre-fitted partial model from " + prefitted_model_folder
-        )
-        (
-            X_train,
-            Y_train,
-            X_mnk_train,
-            X_test,
-            Y_test,
-            X_mnk_test,
-            model_partial,
-            log,
-        ) = fetch_pre_trained_model_partial(
-            X, X_mnk, Y, model_args, prefitted_model_folder, log
-        )
-
-    # ===============================================================================
-    # Evaluate partial model
-    if model_partial is not None:
-        log = evaluate_model(
-            model_partial,
-            X_train,
-            X_mnk_train,
-            Y_train,
-            X_test,
-            X_mnk_test,
-            Y_test,
-            max_performances_ref,
-            max_performances_algo,
-            baseline_performances_algo,
-            data_nrows,
-            log,
-            folder,
-        )
-
-    # ===============================================================================
-    # Refit to the entire dataset
-    # Get or train model fit on the entire dataset (i.e. not just on the "training" part of the data)
-    model_file = os.path.join(prefitted_model_folder, "feature_tree_refit.p")
-    if (
-        run_intermediate_evaluation
-        or len(prefitted_model_folder) == 0
-        or not os.path.exists(model_file)
-    ):
-        log += print_and_log(visual_separator)
-        log += print_and_log("\nRefit to the entire dataset:")
-        X = X_train.append(X_test, ignore_index=True)
-        X_mnk = X_mnk_train.append(X_mnk_test, ignore_index=True)
-        Y = Y_train.append(Y_test, ignore_index=True)
-        model_partial.fit(X, Y)
-        model = (
-            model_partial  # This model is fit on the entire dataset, it is not partial
-        )
-        results_file = os.path.join(folder, "feature_tree_refit.p")
-        safe_pickle([X.columns.values, model], results_file)
-    else:
-        log += print_and_log(
-            "\nReading pre-fitted model from " + prefitted_model_folder
-        )
-        X, model, log = fetch_pre_trained_model(prefitted_model_folder, X, log)
-
-    # ===============================================================================
-    # Evaluate refit-model
-    log = evaluate_model(
-        model,
-        X,
-        X_mnk,
-        Y,
-        None,
-        None,
-        None,
-        max_performances_ref,
-        max_performances_algo,
-        baseline_performances_algo,
-        data_nrows,
-        log,
-        folder,
-    )
-
-    # ===============================================================================
-    # Print log
-    log += print_and_log(visual_separator)
-    with open(log_file, "w") as f:
-        f.write(log)
-
-
-# ===============================================================================
-# Model hyperparameters
-optimized_hyperparameters = {
-    # chosen by hyperparameter optimization. The optimal parameter depends on the GPU, the data ...
-    # the values below are the average of the optimal value for the P100 and the V100
-    "tiny": {
-        "scikit_max_depth": 16,
-        "scikit_min_samples_leaf": 2,
-        "scikit_min_samples_split": 15,
-        "xgboost_max_depth": 12,
-        "xgboost_learning_rate": 0.1,
-        "xgboost_n_estimators": 100,
-    },
-    "small": {
-        "scikit_max_depth": 16,
-        "scikit_min_samples_leaf": 2,
-        "scikit_min_samples_split": 15,
-        "xgboost_max_depth": 14,
-        "xgboost_learning_rate": 0.1,
-        "xgboost_n_estimators": 170,
-    },
-    "medium": {
-        "scikit_max_depth": 18,
-        "scikit_min_samples_leaf": 2,
-        "scikit_min_samples_split": 13,
-        "xgboost_max_depth": 14,
-        "xgboost_learning_rate": 0.1,
-        "xgboost_n_estimators": 140,
-    },
-    "largeDB1": {
-        "scikit_max_depth": 18,
-        "scikit_min_samples_leaf": 2,
-        "scikit_min_samples_split": 15,
-        "xgboost_max_depth": 14,
-        "xgboost_learning_rate": 0.1,
-        "xgboost_n_estimators": 170,
-    },
-    "largeDB2": {
-        "scikit_max_depth": 18,
-        "scikit_min_samples_leaf": 2,
-        "scikit_min_samples_split": 15,
-        "xgboost_max_depth": 14,
-        "xgboost_learning_rate": 0.1,
-        "xgboost_n_estimators": 170,
-    },
-}
-
-
-# ===============================================================================
-# Printing and dumping helpers
-def get_log_folder(prefitted_model_folder, destination_folder, algo):
-    """Create a unique log folder for this run in which logs, plots etc. will be stored"""
-    if len(prefitted_model_folder) == 0:
-        # Create a new folder for this model
-        file_signature = datetime.datetime.now().strftime("%Y-%m-%d--%H-%M")
-        folder_name = os.path.join(
-            "model_selection", os.path.join(algo, file_signature)
-        )
-        if destination_folder != ".":
-            folder = os.path.join(destination_folder, folder_name)
-        else:
-            folder = folder_name
-        log_file = os.path.join(folder, "log.txt")
-        if not os.path.exists(folder):
-            while True:  # loop until we've created a folder
-                try:
-                    os.makedirs(folder)
-                    break
-                except FileExistsError:
-                    time_stamp_seconds = datetime.datetime.now().strftime("-%S")
-                    new_folder = folder + time_stamp_seconds
-                    print(
-                        "Folder {} exists already. Trying to create folder {}.".format(
-                            folder, new_folder
-                        )
-                    )
-                    folder = new_folder
-
-    else:
-        # If loading a pre-fitted model, use this pre-fitted model's folder as a log folder, but create a new log file
-        folder = prefitted_model_folder
-        log_file_signature = datetime.datetime.now().strftime("%Y-%m-%d--%H-%M")
-        log_file = os.path.join(folder, "log_" + log_file_signature + ".txt")
-
-    # Log folder and file
-    log = ""
-    log += print_and_log("\nLogging to:")
-    log += print_and_log("\t" + folder)
-    log += print_and_log("\t" + log_file)
-
-    return folder, log_file, log
-
-
-def dump_or_load_options(algo, model_args, prefitted_model, nrows, folder, log):
-    options_file_name = os.path.join(folder, "options.json")
-    pgm_options = {"folder": folder, "algo": algo, "nrows": nrows}
-    pgm_options.update(model_args)
-
-    if len(prefitted_model) == 0:
-        # if we're training a model, dump options to folder so they can be reloaded in another run
-        print("Dump options to", options_file_name)
-        with open(options_file_name, "w") as f:
-            json.dump(pgm_options, f)
-
-    else:
-        # if we're using a pre-fitted model, load options from that model
-        print("Read options from", options_file_name)
-        with open(options_file_name, "r") as f:
-            pgm_options = json.load(f)
-
-        algo = pgm_options["algo"]
-        model_args_list = ["model", "splits", "ntrees", "njobs"]
-        model_args = dict()
-        for m in model_args_list:
-            model_args[m] = pgm_options[m]
-        nrows = pgm_options["nrows"]
-
-    # Log options
-    log += print_and_log("Predict-train running with options:")
-    for opt, opt_val in pgm_options.items():
-        log += print_and_log("{:<15}: {}".format(opt, opt_val))
-
-    return algo, model_args, nrows, log
-
-
-def print_and_log(msg):
-    if not isinstance(msg, str):
-        msg = str(msg)
-    log = "\n" + msg
-    print(msg)
-    return log
-
-
-def dask_to_pandas(*dfs):
-    """Convert training data dask -> pandas"""
-    pd_dfs = [df.compute() for df in dfs]
-    return pd_dfs[0] if len(pd_dfs) == 1 else pd_dfs
-
-
-def pandas_to_dask(*dfs):
-    """Convert training data pandas -> dask"""
-    dd_dfs = [dd.from_pandas(df, npartitions=3) for df in dfs]
-    return dd_dfs[0] if len(dd_dfs) == 1 else dd_dfs
-
-
-# ===============================================================================
-# Custom loss functions and scorers
-def perf_loss(y_true, y_pred, top_k, X_mnk, scaled=True):
-    """
-    Compute the relative performance losses per mnk if one were to measure the top-k best predicted sets of parameters
-    and pick the best out of this top-k
-
-    :param y_true: ground truth performances (performance scaled between 0 and 1)
-    :param y_pred: estimated performances (performance scaled between 0 and 1)
-    :param top_k: number of top performances to measure
-    :param X_mnk: corresponding mnks
-    :return: perf_losses: array of relative performance losses (in %), one array element per mnk
-    """
-    assert len(y_true.index) == y_pred.flatten().size
-    assert len(y_true.index) == len(X_mnk.index)
-
-    perf_losses = list()
-    mnks = np.unique(X_mnk["mnk"].values)
-    for mnk in mnks:
-        # Get performances per mnk
-        idx_mnk = np.where(X_mnk == mnk)[0].tolist()
-        assert len(idx_mnk) > 0, "idx_mnk is empty"
-        y_true_mnk = y_true.iloc[idx_mnk]
-        y_pred_mnk = y_pred[idx_mnk]
-
-        # Get top-k best predicted performances
-        if top_k != 1:
-            top_k_idx = np.argpartition(-y_pred_mnk, top_k)[:top_k]
-        else:
-            top_k_idx = np.argmax(y_pred_mnk)
-        y_correspmax = y_true_mnk.iloc[top_k_idx]
-
-        # Chosen max perf. among predicted max performances
-        maxperf_chosen = np.amax(y_correspmax)
-
-        # True Max. performances
-        if not scaled:
-            maxperf = float(y_true_mnk.max(axis=0))
-            assert maxperf >= 0, "Found non-positive value for maxperf: " + str(maxperf)
-            perf_loss = (maxperf - maxperf_chosen) / maxperf
-        else:
-            perf_loss = 1.0 - maxperf_chosen
-
-        # Relative performance loss incurred by using model-predicted parameters instead of autotuned ones [%]
-        perf_losses.append(100 * perf_loss)
-
-    return perf_losses
-
-
-def worse_rel_perf_loss_of_k(y_true, y_pred, top_k, X_mnk, scaled=True):
-    y = np.array(perf_loss(y_true, y_pred, top_k, X_mnk, scaled))
-    return float(y.max(axis=0))
-
-
-def mean_rel_perf_loss_of_k(y_true, y_pred, top_k, X_mnk, scaled=True):
-    y = np.array(perf_loss(y_true, y_pred, top_k, X_mnk, scaled))
-    return float(y.mean(axis=0))
-
-
-def worse_case_scorer(estimator, X, y, top_k):
-    """
-    :param estimator: the model that should be evaluated
-    :param X: validation data
-    :param y: ground truth target for X
-    :return: score: a floating point number that quantifies the estimator prediction quality on X, with reference to y
-    """
-    mnk = dd.DataFrame()
-    mnk["mnk"] = X["mnk"].copy()
-    y_pred = estimator.predict(X.drop(["mnk"].values, axis=1))
-    score = worse_rel_perf_loss_of_k(y, y_pred, top_k, mnk)
-    return (
-        -score
-    )  # by scikit-learn convention, higher numbers are better, so the value should be negated
-
-
-def worse_case_scorer_top1(estimator, X, y):
-    return worse_case_scorer(estimator, X, y, 1)
-
-
-def mean_scorer(estimator, X, y, top_k):
-    """
-    :param estimator: the model that should be evaluated
-    :param X: validation data
-    :param y: ground truth target for X
-    :return: score: a floating point number that quantifies the estimator prediction quality on X, with reference to y
-    """
-    mnk = dd.DataFrame()
-    mnk["mnk"] = X["mnk"].copy()
-    y_pred = estimator.predict(X.drop(["mnk"].values, axis=1))
-    score = mean_rel_perf_loss_of_k(y, y_pred, top_k, mnk)
-    return (
-        -score
-    )  # by scikit-learn convention, higher numbers are better, so the value should be negated
-
-
-def mean_scorer_top1(estimator, X, y):
-    return mean_scorer(estimator, X, y, 1)
-
-
-# ===============================================================================
-# Read and prepare data
-def get_reference_performances(folder, algo):
-    import json
-
-    maxperf_file = os.path.join(folder, "max_performances.json")
-    with open(maxperf_file) as f:
-        max_performances = json.load(f)
-
-    maxperf_file = os.path.join(folder, "max_performances_by_algo.json")
-    with open(maxperf_file) as f:
-        max_performances_algo = json.load(f)[algo]
-
-    max_performances_ref = max_performances
-
-    baseline_file = os.path.join(folder, "baseline_performances_by_algo.json")
-    with open(baseline_file) as f:
-        baseline_performances_algo = json.load(f)[algo]
-
-    return (
-        max_performances,
-        max_performances_algo,
-        max_performances_ref,
-        baseline_performances_algo,
-    )
-
-
-def read_data(algo, read_from, nrows, folder, log):
-    parquet_data_file = os.path.join(read_from, "training_data_" + algo + ".parquet")
-    log += print_and_log("\nRead data from " + parquet_data_file)
-
-    # ===============================================================================
-    # Get 'X'
-    cols_to_ignore = [
-        "perf_scaled",
-        "mnk",
-        "perf (Gflop/s)",
-        "perf_scaled_by_algo",
-        "perf_squared",
-    ]
-    X = dd.read_parquet(parquet_data_file)
-    cols_to_drop = set(cols_to_ignore).intersection(set(X.columns.values))
-    log += print_and_log("\nDropping following columns from X:\n" + str(cols_to_drop))
-    X = X.drop(cols_to_drop, axis=1)
-    log += print_and_log(
-        "X    : {:>8,} x {:>8,} ({:>2.2} MB)".format(
-            len(X), len(X.columns), sys.getsizeof(X) / 10**6
-        )
-    )
-    log += print_and_log("Head:")
-    log += print_and_log(X.head())
-    n_features = len(list(X.columns))
-    predictor_names = X.columns.values
-    log += print_and_log("\nPredictor variables: (" + str(n_features) + ")")
-    for i, p in enumerate(predictor_names):
-        log += print_and_log("\t{:2}) {}".format(i + 1, p))
-
-    # ===============================================================================
-    # Get 'Y'
-    log += print_and_log("\nRead Y")
-    Y = dd.read_parquet(parquet_data_file, columns=["perf_scaled"])
-    log += print_and_log(
-        "Y    : {:>8,} ({:>2.2} MB)".format(len(Y), sys.getsizeof(Y) / 10**6)
-    )
-    log += print_and_log("Head:")
-    log += print_and_log(Y.head())
-
-    # ===============================================================================
-    # Get 'X_mnk'
-    log += print_and_log("\nRead X_mnk")
-    X_mnk = dd.read_parquet(parquet_data_file, columns=["mnk"])
-    nrows_data = len(X_mnk.index)
-    log += print_and_log(
-        "X_mnk : {:>8,} ({:>2.2} MB)".format(nrows_data, sys.getsizeof(X_mnk) / 10**6)
-    )
-    log += print_and_log("Head:")
-    log += print_and_log(X_mnk.head())
-    log += print_and_log("# unique mnks:")
-    log += print_and_log(str(X_mnk["mnk"].nunique().compute()) + "\n")
-
-    return X, X_mnk, Y, log, nrows_data
-
-
-# ===============================================================================
-# Predictive modelling
-def get_hyperparameter_grid(algo, model_name, n_features):
-    # Hyper-parameters to optimize
-    param_grid = dict()
-    if "scikit" in model_name:  # it is a scikit-learn model
-        if algo == "medium":
-            max_depth = [10, 13, 16, 18, 21, 24]
-            min_samples_split = [2, 8, 12, 18]
-            min_samples_leaf = [2, 8, 12, 18]
-        elif algo == "tiny":
-            step = 1
-            max_depth = range(4, int(2 * n_features) + 1, step)
-            min_samples_split = range(1, 26, step)
-            min_samples_leaf = range(1, 26, step)
-        elif algo == "small":
-            step = 3
-            max_depth = range(4, int(2 * n_features) + 1, step)
-            min_samples_split = [2, 5, 8, 13, 18]
-            min_samples_leaf = [2, 5, 8, 13, 18]
-        else:  # largeDB1,2
-            step = 3
-            max_depth = range(4, int(2 * n_features) + 1, step)
-            min_samples_split = range(2, 21, step)
-            min_samples_leaf = range(2, 21, step)
-        param_grid = {
-            model_name + "__estimator__" + "max_depth": list(max_depth),
-            model_name + "__estimator__" + "min_samples_split": list(min_samples_split),
-            model_name + "__estimator__" + "min_samples_leaf": list(min_samples_leaf),
-        }
-    elif "xgb" in model_name:  # it is an XGBOOST model
-        if algo == "medium":
-            max_depth = [16, 13]
-            n_estimators = [100, 140]
-            learning_rate = [0.1]
-        elif algo == "tiny":
-            max_depth = range(10, n_features + 2, 1)
-            n_estimators = range(30, 160, 20)
-            learning_rate = range(1, 5)
-            learning_rate = [i / 10 for i in learning_rate]
-        elif algo == "small":
-            max_max_depth = 20
-            max_depth = range(10, min(max_max_depth, n_features + 2), 4)
-            n_estimators = range(50, 200, 30)
-            learning_rate = [0.1, 0.3]
-        else:  # largeDB1,2
-            max_max_depth = 20
-            max_depth = range(10, min(max_max_depth, n_features + 2), 4)
-            n_estimators = range(50, 200, 30)
-            learning_rate = [0.1, 0.3]
-        param_grid = {
-            "max_depth": list(max_depth),
-            "learning_rate": list(learning_rate),
-            "n_estimators": list(n_estimators),
-        }
-    else:
-        raise AssertionError("Cannot recognize model: " + model_name)
-
-    return param_grid
-
-
-def get_scikit_DecisionTree_model(algo):
-    from sklearn.tree import DecisionTreeRegressor
-
-    model = DecisionTreeRegressor(
-        criterion="mse",
-        splitter="best",
-        min_samples_split=optimized_hyperparameters[algo]["scikit_min_samples_split"],
-        min_samples_leaf=optimized_hyperparameters[algo]["scikit_min_samples_leaf"],
-        max_depth=optimized_hyperparameters[algo]["scikit_max_depth"],
-        max_features=None,
-        max_leaf_nodes=None,
-    )
-    # Feature selection through permutation importance
-    from eli5.sklearn import PermutationImportance
-
-    model_perm = PermutationImportance(model, cv=None)
-    return model_perm, "scikit-Decision_Tree"
-
-
-def get_scikit_RandomForest_model(algo, njobs, ntrees):
-    from sklearn.ensemble import RandomForestRegressor
-
-    model = RandomForestRegressor(
-        criterion="mse",
-        n_estimators=ntrees,
-        min_samples_split=optimized_hyperparameters[algo]["scikit_min_samples_split"],
-        min_samples_leaf=optimized_hyperparameters[algo]["scikit_min_samples_leaf"],
-        max_depth=optimized_hyperparameters[algo]["scikit_max_depth"],
-        bootstrap=True,
-        max_features="sqrt",
-        n_jobs=njobs,
-    )
-    return model, "scikit-Random_Forest"
-
-
-def get_xgb_DecisionTree_model(algo, njobs, ntrees):
-    params = {
-        "max_depth": optimized_hyperparameters[algo]["xgboost_max_depth"],
-        "learning_rate": optimized_hyperparameters[algo]["xgboost_learning_rate"],
-        "n_estimators": optimized_hyperparameters[algo]["xgboost_n_estimators"],
-        "tree_method": "exact",
-        "verbosity": 2,
-        "objective": "reg:squarederror",
-        "booster": "gbtree",
-        "n_jobs": njobs,
-    }
-    model = xgb.XGBRegressor(**params)
-    return model, "xgb-Decision_Tree"
-
-
-def get_xgb_DecisionTree_dask_model(algo, njobs, ntrees):
-    params = {
-        "max_depth": optimized_hyperparameters[algo]["xgboost_max_depth"],
-        "learning_rate": optimized_hyperparameters[algo]["xgboost_learning_rate"],
-        "n_estimators": optimized_hyperparameters[algo]["xgboost_n_estimators"],
-        "tree_method": "exact",
-        "verbosity": 2,
-        "objective": "reg:squarederror",
-        "booster": "gbtree",
-        "n_jobs": njobs,
-    }
-    from dask_ml.xgboost import XGBRegressor_dask
-
-    model = XGBRegressor_dask(**params)
-    return model, "xgb-Decision_Tree_dask"
-
-
-def get_xgb_DecisionTree_GPU_model(algo, njobs, ntrees):
-    params = {
-        "max_depth": optimized_hyperparameters[algo]["xgboost_max_depth"],
-        "learning_rate": optimized_hyperparameters[algo]["xgboost_learning_rate"],
-        "n_estimators": optimized_hyperparameters[algo]["xgboost_n_estimators"],
-        "tree_method": "gpu_hist",
-        "verbosity": 2,
-        "objective": "reg:squarederror",
-        "booster": "gbtree",
-        "n_jobs": njobs,
-    }
-    model = xgb.XGBRegressor(**params)
-    return model, "xgb-Decision_Tree_GPU"
-
-
-def get_xgb_RandomForest_model(algo, njobs, ntrees):
-    params = {
-        "max_depth": optimized_hyperparameters[algo]["xgboost_max_depth"],
-        "learning_rate": optimized_hyperparameters[algo]["xgboost_learning_rate"],
-        "n_estimators": optimized_hyperparameters[algo]["xgboost_n_estimators"],
-        "tree_method": "exact",
-        "nthread": njobs,
-        "subsample": 0.5,
-        "colsample_bynode": 0.8,
-        "num_parallel_tree": ntrees,
-        "verbosity": 2,
-        "objective": "reg:squarederror",
-    }
-    model = xgb.XGBRFRegressor(**params)
-    return model, "xgb-Random_Forest"
-
-
-def get_model(model_to_train, algo, njobs, ntrees):
-    if model_to_train == "DT":
-        model, model_name = get_scikit_DecisionTree_model(algo)
-    elif model_to_train == "RF":
-        model, model_name = get_scikit_RandomForest_model(algo, njobs, ntrees)
-    elif model_to_train == "xgb-DT":
-        model, model_name = get_xgb_DecisionTree_model(algo, njobs, ntrees)
-    elif model_to_train == "xgb-DT-dask":
-        model, model_name = get_xgb_DecisionTree_dask_model(algo, njobs, ntrees)
-    elif model_to_train == "xgb-DT-GPU":
-        model, model_name = get_xgb_DecisionTree_GPU_model(algo, njobs, ntrees)
-    elif model_to_train == "xgb-RF":
-        model, model_name = get_xgb_RandomForest_model(algo, njobs, ntrees)
-    else:
-        raise AssertionError(
-            "Cannot recognize model: " + model_to_train + ". Options: DT, RF"
-        )
-    return model, model_name
-
-
-def get_train_test_partition(to_partition, test, train=None):
-    """
-    Perform train/test partition
-    :param to_partition: sequence of objects to partition
-    :param test: ndarray, test-indices
-    :param train (optional): ndarray
-    :return:
-    """
-    if train is None:  # Retrieve training indices
-        all_indices = set(range(len(to_partition[0].index)))
-        train = list(all_indices - set(test))
-
-    print(
-        "About to partition into train (len: {:,}) / test (len: {:,})".format(
-            len(train), len(test)
-        )
-    )
-    partitioned = list()
-    for df in to_partition:
-        df_train = df.iloc[
-            train, :
-        ]  # train: use for hyper-parameter optimization (via CV) and training
-        partitioned.append(df_train)
-        df_test = df.iloc[
-            test, :
-        ]  # test : use for evaluation of 'selected/final' model
-        partitioned.append(df_test)
-
-    print("Returning object of length: {}".format(len(partitioned)))
-    return partitioned
-
-
-def train_model(X, X_mnk, Y, algo, model_options, folder, log):
-    # ===============================================================================
-    # Get options
-    results_file = os.path.join(folder, "feature_tree.p")
-
-    # ===============================================================================
-    # Testing splitter (train/test-split)
-    from sklearn.model_selection import GroupShuffleSplit
-
-    cv = GroupShuffleSplit(n_splits=2, test_size=0.2)
-    train_test_splits = cv.split(X, Y, groups=X_mnk["mnk"])
-    train, test = next(train_test_splits)
-    (
-        X_train,
-        X_test,
-        Y_train,
-        Y_test,
-        X_mnk_train,
-        X_mnk_test,
-    ) = get_train_test_partition([X, Y, X_mnk], test, train)
-    plot_train_test_partition(test, train, X_mnk, folder)
-    log += print_and_log(
-        "\nComplete train/test split, total size="
-        + str(X.shape)
-        + ", test size="
-        + str(X_test.shape)
-        + ", train_size="
-        + str(X_train.shape)
-    )
-    del X, X_mnk, Y  # free memory
-    predictor_names = X_train.columns.values
-
-    # ===============================================================================
-    # Predictive model
-    model_to_train = model_options["model"]
-    model, model_name = get_model(
-        model_to_train, algo, model_options["njobs"], model_options["ntrees"]
-    )
-    log += print_and_log(
-        "\nStart tune/train for model " + model_name + " with parameters:"
-    )
-    log += print_and_log(model)
-
-    # ===============================================================================
-    # Cross-validation splitter (train/validation-split)
-    test_size = 0.3
-    cv = GroupShuffleSplit(n_splits=model_options["splits"], test_size=test_size)
-
-    # ===============================================================================
-    # Feature selection: SelectFromModel
-    from sklearn.feature_selection import SelectFromModel
-
-    feature_importance_threshold = (
-        0.0005  # only remove the features with VERY little importance
-    )
-    model.cv = cv.split(X_train.values, Y_train.values, groups=X_mnk_train.values)
-    model.fit(X_train.values, Y_train.values)
-    model_fs = SelectFromModel(
-        model, threshold=feature_importance_threshold, max_features=None, prefit=True
-    )
-    print(model_fs)
-    model.cv = None
-
-    # ===============================================================================
-    # Info on feature selection
-    all_feature_names = X_train.columns.values.tolist()
-    feature_support = model_fs.get_support()
-    features_importances = model.feature_importances_
-    feature_name_importance = zip(
-        all_feature_names, features_importances, feature_support
-    )
-    feature_name_importance = sorted(
-        feature_name_importance, key=lambda x: x[1], reverse=True
-    )
-
-    log += print_and_log(visual_separator)
-    n_selected_features = np.sum(feature_support)
-    log += print_and_log("Optimal number of features : {}".format(n_selected_features))
-
-    # Selected features
-    log += print_and_log("\nFeatures:")
-    selected_features = list()
-    selected_feature_importances = list()
-    for i, (feat_name, feat_imp, feat_in) in enumerate(feature_name_importance):
-        in_or_out = "accepted" if feat_in else " x rejected"
-        log += print_and_log(
-            "{:>2}) {:<40}, imp: {:>1.3f} {}".format(
-                i + 1, feat_name, feat_imp, in_or_out
-            )
-        )
-        if feat_in:
-            selected_features.append(feat_name)
-            selected_feature_importances.append(feat_imp)
-    plot_feature_importance(features_importances, all_feature_names, folder)
-
-    # Drop non-selected features
-    features_to_drop = [f for f in predictor_names if f not in selected_features]
-    X_train = X_train.drop(features_to_drop, axis=1)
-    X_test = X_test.drop(features_to_drop, axis=1)
-    n_features = len(X_train.columns)
-
-    # ===============================================================================
-    # Fit
-    out_of_memory_computation = "dask" in model_options["model"]
-    if out_of_memory_computation:
-        X_train, Y_train = pandas_to_dask(X_train, Y_train)
-
-    if model_options["hyperparameter_optimization"]:
-        # Hyperparameter Optimization
-        param_grid = get_hyperparameter_grid(algo, model_name, n_features)
-        if param_grid is None:
-            raise AssertionError("param_grid object is None. Please implement!")
-
-        # At this point, we "cheat"/"take a shortcut" in 2 ways:
-        # - we split into train/test partitions using the simple default splitter, not one that is aware of mnk-groups
-        # - we use an overall MSE scorer, not one that looks at the performance loss of predicted mnks wrt. autotuned
-        if out_of_memory_computation:
-            from dask_ml.model_selection import GridSearchCV
-
-            gds_pars = {
-                "estimator": model,
-                "param_grid": param_grid,
-                "cv": model_options["splits"],
-                "refit": True,
-                "n_jobs": 1,
-            }
-        else:
-            from sklearn.model_selection import GridSearchCV
-
-            gds_pars = {
-                "estimator": model,
-                "param_grid": param_grid,
-                "cv": model_options["splits"],
-                "refit": True,
-                "n_jobs": 1,
-                "verbose": 2,
-            }
-        gds = GridSearchCV(**gds_pars)
-        log += print_and_log(visual_separator)
-        log += print_and_log("\nStart hyperparameter optimization & training ... :\n")
-        log += print_and_log("Hyper-parameter grid:")
-        for par, values in param_grid.items():
-            log += print_and_log("\t" + par + ": " + str(values))
-        log += print_and_log("\n")
-        gds.fit(X_train.values, Y_train.values)
-        log += print_and_log("... done")
-        describe_hpo(gds, log, folder)
-        model = gds.best_estimator_
-
-    else:
-        # Fit
-        log += print_and_log(visual_separator)
-        log += print_and_log("\nStart fitting model with predictors:\n")
-        for i, p in enumerate(X_train.columns.values):
-            log += print_and_log("\t{:>2}) {}".format(i + 1, p))
-
-        model.fit(X_train, Y_train)
-
-    safe_pickle([X_train.columns.values, model, test], results_file)
-    log += print_and_log("\nCompleted fit, wrote results to " + results_file)
-    log += print_and_log(visual_separator)
-    return_model = model
-
-    # Return
-    if "mnk" in X_train.columns.values:
-        X_train.drop("mnk", axis=1, inplace=True)
-    if "mnk" in X_test.columns.values:
-        X_train.drop("mnk", axis=1, inplace=True)
-
-    if out_of_memory_computation:
-        X_train, Y_train = dask_to_pandas(X_train, Y_train)
-
-    return X_train, Y_train, X_mnk_train, X_test, Y_test, X_mnk_test, return_model, log
-
-
-def fetch_pre_trained_model(model_path_folder, X, log):
-    model_path = os.path.join(model_path_folder, "feature_tree_refit.p")
-    print("fetched pre-trained model from: {}".format(model_path))
-    features, model = safe_pickle_load(model_path)
-    print("Pickled variables:\nfeatures:{}\nmodel:{}".format(features, model))
-
-    log += print_and_log("\nDrop non-selected features")
-    predictor_names = X.columns.values.tolist()
-    features_to_drop = [f for f in predictor_names if f not in features]
-    X.drop(features_to_drop, axis=1, inplace=True)
-    return X, model, log
-
-
-def fetch_pre_trained_model_partial(X, X_mnk, Y, model_options, model_path_folder, log):
-    # Load pre-trained model, selected features and indices of test-set
-    model_path = os.path.join(model_path_folder, "feature_tree.p")
-    print("fetched partial pre-trained model from: {}".format(model_path))
-    features, model, test_indices = safe_pickle_load(model_path)
-    print(
-        "Pickled stuff:\nfeatures:{}\nmodel:{}\ntest_indices:{}".format(
-            features, model, test_indices
-        )
-    )
-    if "mnk" in features:
-        features.remove("mnk")
-
-    log += print_and_log("\nPerform train/test split")
-    (
-        X_train,
-        X_test,
-        Y_train,
-        Y_test,
-        X_mnk_train,
-        X_mnk_test,
-    ) = get_train_test_partition([X, Y, X_mnk], test_indices)
-    log += print_and_log(
-        "\nComplete train/test split, total size="
-        + str(X.shape)
-        + ", test size="
-        + str(X_test.shape)
-        + ", train_size="
-        + str(X_train.shape)
-    )
-
-    log += print_and_log("\nDrop non-selected features")
-    predictor_names = X_train.columns.values.tolist()
-    features_to_drop = [f for f in predictor_names if f not in features]
-    X_train.drop(features_to_drop, axis=1, inplace=True)
-    X_test.drop(features_to_drop, axis=1, inplace=True)
-
-    out_of_memory_computation = "dask" in model_options["model"]
-    if out_of_memory_computation:
-        X_train, Y_train = pandas_to_dask(X_train, Y_train)
-
-    return X_train, Y_train, X_mnk_train, X_test, Y_test, X_mnk_test, model, log
-
-
-# ===============================================================================
-# Describe and evaluate model
-def describe_hpo(gs, log, folder):
-    # Scores obtained during hyperparameter optimization
-    columns_to_print = list()
-    for par in gs.param_grid.keys():
-        columns_to_print.append("param_" + par)
-    columns_to_print += [
-        "mean_test_score",
-        "std_test_score",
-        "mean_train_score",
-        "std_train_score",
-    ]
-    log += print_and_log("\nHyperparameter search results (head):")
-    cv_results = pd.DataFrame(gs.cv_results_)[columns_to_print]
-    with pd.option_context("display.max_rows", None, "display.max_columns", None):
-        log += print_and_log(cv_results.head())
-    cv_results_path = os.path.join(folder, "hyperparameter_optimization_results.csv")
-    with open(cv_results_path, "w") as f:
-        cv_results.to_csv(f, index=False)
-    log += print_and_log("Wrote hyperparameter results to " + cv_results_path)
-
-    # Best parameter set
-    log += print_and_log("\nBest parameters set found on development set:")
-    for bestpar_name, bestpar_value in gs.best_params_.items():
-        log += print_and_log("\t{}: {}".format(bestpar_name, bestpar_value))
-
-    # Best estimator
-    log += print_and_log("\nBest estimator:")
-    best_estimator = gs.best_estimator_
-    log += print_and_log(best_estimator)
-    log += print_and_log(visual_separator)
-
-    return log
-
-
-def describe_model(model, X, Y, log):
-    predictor_names = X.columns.values.tolist()
-    log += print_and_log("Model:")
-    log += print_and_log(model)
-
-    log += print_and_log("Predictor variables:")
-    for p in predictor_names:
-        log += print_and_log("\t{}".format(p))
-
-    return log
-
-
-def print_custom_error(y_true, y_pred, X_mnk, log, scaled=True):
-    result_line = (
-        "\tRelative performance loss compared to autotuned max:\n"
-        + "top-{}: worse: {:>6.3f} [%], mean: {:>6.3f} [%]"
-    )
-    for top_k in [1]:
-        log += print_and_log(
-            result_line.format(
-                top_k,
-                worse_rel_perf_loss_of_k(y_true, y_pred, top_k, X_mnk, scaled),
-                mean_rel_perf_loss_of_k(y_true, y_pred, top_k, X_mnk, scaled),
-            )
-        )
-    return log
-
-
-def print_error(y_true, y_pred, log):
-    from sklearn.metrics import mean_absolute_error, mean_squared_error
-
-    result_line = "\tOverall error:\n" + "absolute: {:>6.3f}, mean squared {:>6.3f}"
-    log += print_and_log(
-        result_line.format(
-            mean_absolute_error(y_true, y_pred), mean_squared_error(y_true, y_pred)
-        )
-    )
-    return log
-
-
-def scale_back(y_scaled, x_mnk, max_performances, mnk=None):
-    if mnk is None:
-        corresponding_maxperf = np.array(
-            [max_performances[mnk] for mnk in x_mnk["mnk"].values.tolist()]
-        )
-    else:
-        corresponding_maxperf = max_performances[mnk]
-    return y_scaled * corresponding_maxperf
-
-
-def plot_train_test_partition(test_idx, train_idx, X_mnk, folder):
-    import matplotlib.pyplot as plt
-
-    mnks_string_train = X_mnk["mnk"].iloc[train_idx].unique()
-    mnks_train = to_tuple(*mnks_string_train)
-    mnks_string_test = X_mnk["mnk"].iloc[test_idx].unique()
-    mnks_test = to_tuple(*mnks_string_test)
-
-    y_train_product = (
-        dict()
-    )  # keys: m*n*k, values: how many times this mnk-product appears in training-mnks
-    for m, n, k in mnks_train:
-        mxnxk = m * n * k
-        if mxnxk in y_train_product.keys():
-            y_train_product[mxnxk] += 1
-        else:
-            y_train_product[mxnxk] = 1
-
-    train_mnks = list()
-    train_counts = list()
-    for mnk, count in y_train_product.items():
-        for c in range(count):
-            train_mnks.append(mnk)
-            train_counts.append(c + 1)
-
-    y_test_product = dict()
-    for m, n, k in mnks_test:
-        mxnxk = m * n * k
-        if mxnxk in y_test_product.keys():
-            y_test_product[mxnxk] += 1
-        else:
-            y_test_product[mxnxk] = 1
-
-    test_mnks = list()
-    test_counts = list()
-    for mnk, count in y_test_product.items():
-        for c in range(count):
-            test_mnks.append(mnk)
-            if mnk in y_train_product.keys():
-                test_counts.append(y_train_product[mnk] + c + 1)
-            else:
-                test_counts.append(c + 1)
-
-    plt.figure(figsize=(30, 5))
-    markersize = 12
-    plt.plot(
-        train_mnks,
-        train_counts,
-        "o",
-        markersize=markersize,
-        color="blue",
-        label="training mnks (" + str(len(train_mnks)) + ")",
-    )
-    plt.plot(
-        test_mnks,
-        test_counts,
-        "o",
-        markersize=markersize,
-        color="red",
-        label="testing mnks (" + str(len(test_mnks)) + ")",
-    )
-    plot_file_path = os.path.join(folder, "train-test_split.svg")
-    plt.xlabel("m * n * k triplets")
-    plt.ylabel("number of occurences in data set")
-    plt.title("Train/test split")
-    maxcount = max(max(test_counts), max(train_counts)) + 1
-    plt.ylim([0, maxcount])
-    plt.legend()
-    plt.savefig(plot_file_path)
-
-
-def plot_feature_importance(importances, names, folder):
-    plt.rcdefaults()
-    fig, ax = plt.subplots()
-
-    ax.set_title("Feature importances")
-    ax.barh(range(len(names)), importances, color="g", align="center")
-    ax.set_yticks(np.arange(len(importances)))
-    ax.set_yticklabels(names)
-    ax.invert_yaxis()
-    plot_file_path = os.path.join(folder, "feature_importance.svg")
-    plt.savefig(plot_file_path)
-    print(plot_file_path)
-
-
-def plot_loss_histogram(y_true, y_pred, X_mnk, folder):
-    import matplotlib.pyplot as plt
-
-    # Get losses
-    top_k = 1
-    y = np.array(perf_loss(y_true, y_pred, top_k, X_mnk, False))
-
-    # Losses-histogram
-    num_bins = 100
-    plt.figure()
-    plt.hist(y, num_bins, facecolor="green", alpha=0.75)
-    plt.xlabel("relative performance loss [%]")
-    plt.ylabel("# occurrences")
-    plt.title(
-        "Performance losses for top-k="
-        + str(top_k)
-        + " ("
-        + str(len(y))
-        + " test mnks)"
-    )
-    plot_file_path = os.path.join(folder, "result_losses.svg")
-    plt.savefig(plot_file_path)
-    print(plot_file_path)
-
-
-def plot_prediction_accuracy(m, n, k, y_true, y_pred, train, pp):
-    plt.figure()
-    if train:
-        plt.plot(100 * y_true, 100 * y_pred, "b.", label="truth")
-    else:
-        plt.plot(100 * y_true, 100 * y_pred, "r.", label="truth")
-    plt.xlabel("true scaled performance [%]")
-    plt.ylabel("predicted scaled performance [%]")
-    type = "train" if train else "test"
-    plt.title("Prediction accuracy for kernel " + str((m, n, k)) + " (" + type + ")")
-    pp.savefig()
-
-
-def get_predive_model_performances(
-    y_true, y_pred, x_mnk, max_performances_ref, max_performances_algo
-):
-    predictive_model_perf_scaled = dict()
-
-    for mnk_string in x_mnk["mnk"].unique():
-        idx_mnk = np.where(x_mnk == mnk_string)[0].tolist()
-        assert len(idx_mnk) > 0, "idx_mnk is empty"
-        m, n, k = to_tuple(mnk_string)
-
-        perf_chosen_idx = [np.argmax(y_pred[idx_mnk])]
-        perf_effective = y_true.iloc[idx_mnk].iloc[perf_chosen_idx].values.item()
-        predictive_model_perf_scaled[(m, n, k)] = (
-            perf_effective  # 'scaled' between 0 and 1
-        )
-
-    predictive_model_perf = dict(
-        zip(
-            predictive_model_perf_scaled.keys(),
-            [
-                perf_scaled * max_performances_ref[to_string(mnk)]
-                for mnk, perf_scaled in predictive_model_perf_scaled.items()
-            ],
-        )
-    )
-
-    # Re-scale performances by algorithm for a fair comparison
-    predictive_model_perf_scaled = dict(
-        zip(
-            predictive_model_perf.keys(),
-            [
-                perf / max_performances_algo[mnk]
-                for mnk, perf in predictive_model_perf.items()
-            ],
-        )
-    )
-
-    return predictive_model_perf, predictive_model_perf_scaled
-
-
-# ===============================================================================
-def evaluate_model(
-    model,
-    X_train,
-    X_mnk_train,
-    Y_train,
-    X_test,
-    X_mnk_test,
-    Y_test,
-    max_performances_ref,
-    max_performances_algo,
-    baseline_performances_algo,
-    data_nrows,
-    log,
-    folder,
-):
-    """Main evaluation function"""
-    if model is None:
-        return log
-
-    # Start evaluation
-    log += print_and_log(visual_separator)
-    log += print_and_log("Start model evaluation")
-    if all([x is not None for x in [X_test, Y_test]]):
-        log = describe_model(model, X_test, Y_test, log)
-
-    # Training error
-    if all([x is not None for x in [X_train, X_mnk_train, Y_train]]):
-        y_train_pred = model.predict(X_train.values)
-        log += print_and_log("\nTraining error: (train&val)")
-        log = print_custom_error(Y_train, y_train_pred, X_mnk_train, log, True)
-        log = print_error(Y_train, y_train_pred, log)
-
-        # Test error
-        if all([x is not None for x in [X_test, X_mnk_test, Y_test]]):
-            y_test_pred = model.predict(X_test)
-            log += print_and_log("\nTesting error:")
-            log = print_custom_error(Y_test, y_test_pred, X_mnk_test, log, True)
-            log = print_error(Y_test, y_test_pred, log)
-
-    # Training error (scaled-back)
-    if all([x is not None for x in [X_train, X_mnk_train, Y_train]]):
-        log += print_and_log("\nTraining error (scaled back): (train&val)")
-        y_train_pred_scaled_back = scale_back(
-            y_train_pred, X_mnk_train, max_performances_ref
-        )
-        y_train_scaled_back = pd.DataFrame(
-            scale_back(Y_train.values.flatten(), X_mnk_train, max_performances_ref)
-        )
-        log = print_custom_error(
-            y_train_scaled_back, y_train_pred_scaled_back, X_mnk_train, log, False
-        )
-        log = print_error(y_train_scaled_back, y_train_pred_scaled_back, log)
-
-        if all([x is not None for x in [X_test, X_mnk_test, Y_test]]):
-            # Test error (scaled-back)
-            log += print_and_log("\nTesting error (scaled back): (test&val)")
-            y_test_pred_scaled_back = scale_back(
-                y_test_pred, X_mnk_test, max_performances_ref
-            )
-            y_test_scaled_back = pd.DataFrame(
-                scale_back(Y_test.values.flatten(), X_mnk_test, max_performances_ref)
-            )
-            log = print_custom_error(
-                y_test_scaled_back, y_test_pred_scaled_back, X_mnk_test, log, False
-            )
-            log = print_error(y_test_scaled_back, y_test_pred_scaled_back, log)
-
-    # ===============================================================================
-    # Print histogram for "best" estimator
-    if all([x is not None for x in [X_test, X_mnk_test, Y_test]]):
-        log += print_and_log("\nPlot result histogram:")
-        plot_loss_histogram(Y_test, y_test_pred, X_mnk_test, folder)
-
-    # ===============================================================================
-    # Plot prediction accuracy and goodness of choice for a few mnks (training-set)
-    if all([x is not None for x in [X_train, X_mnk_train, Y_train]]):
-        n_samples = 10 if data_nrows < 100000000 else 2
-        mnks_to_plot = random.sample(X_mnk_train["mnk"].values.tolist(), n_samples)
-
-        from matplotlib.backends.backend_pdf import PdfPages
-
-        plot_file_path = os.path.join(folder, "evaluation_by_mnk_refit.pdf")
-        if all([x is not None for x in [X_test, X_mnk_test, Y_test]]):
-            plot_file_path = os.path.join(folder, "evaluation_by_mnk.pdf")
-        pp = PdfPages(plot_file_path)
-
-        for mnk_string in mnks_to_plot:
-            # Get performances per mnk
-            idx_mnk = np.where(X_mnk_train == mnk_string)[0].tolist()
-            assert len(idx_mnk) > 0, "idx_mnk is empty"
-            m_, n_, k_ = to_tuple(mnk_string)
-            y_train_pred_mnk = y_train_pred[idx_mnk]
-            Y_train_mnk = Y_train.iloc[idx_mnk]
-
-            log += print_and_log("Prediction accuracy plot: " + str(mnk_string))
-
-            plot_prediction_accuracy(
-                m_, n_, k_, Y_train_mnk, y_train_pred_mnk, True, pp
-            )
-
-            log += print_and_log("Goodness plot: " + str(mnk_string))
-            plot_choice_goodness(
-                m_,
-                n_,
-                k_,
-                baseline_performances_algo,
-                max_performances_ref,
-                Y_train["perf_scaled"].iloc[idx_mnk].values,
-                y_train_pred_mnk,
-                True,
-                pp,
-            )
-
-        # ===============================================================================
-        # Plot prediction accuracy for a few mnks (testing-set)
-        if all([x is not None for x in [X_test, X_mnk_test, Y_test]]):
-            mnks_to_plot = random.sample(X_mnk_test["mnk"].values.tolist(), n_samples)
-            for mnk_string in mnks_to_plot:
-                # Get performances per mnk
-                idx_mnk = np.where(X_mnk_test == mnk_string)[0].tolist()
-                assert len(idx_mnk) > 0, "idx_mnk is empty"
-                m_, n_, k_ = to_tuple(mnk_string)
-
-                log += print_and_log("Prediction accuracy plot: " + str(mnk_string))
-                plot_prediction_accuracy(
-                    m_, n_, k_, Y_test.iloc[idx_mnk], y_test_pred[idx_mnk], False, pp
-                )
-
-                log += print_and_log("Goodness plot: " + str(mnk_string))
-                plot_choice_goodness(
-                    m_,
-                    n_,
-                    k_,
-                    baseline_performances_algo,
-                    max_performances_ref,
-                    Y_test["perf_scaled"].iloc[idx_mnk].values,
-                    y_test_pred[idx_mnk],
-                    False,
-                    pp,
-                    True,
-                )
-
-        if all([x is not None for x in [X_train, X_mnk_train, Y_train]]):
-            pp.close()
-
-    # ===============================================================================
-    # Scale baseline and max performances
-    max_performances_algo = dict(
-        zip(
-            [to_tuple(mnk_string) for mnk_string in max_performances_algo.keys()],
-            max_performances_algo.values(),
-        )
-    )
-    max_performances_algo_scaled = dict(
-        zip(max_performances_algo.keys(), [1.0] * len(max_performances_algo))
-    )
-    baseline_performances_algo = dict(
-        zip(
-            [to_tuple(mnk_string) for mnk_string in baseline_performances_algo.keys()],
-            baseline_performances_algo.values(),
-        )
-    )
-    baseline_performances_algo_scaled = dict(
-        zip(
-            [(m, n, k) for m, n, k in baseline_performances_algo.keys()],
-            [
-                perf / max_performances_algo[(m, n, k)]
-                for (m, n, k), perf in baseline_performances_algo.items()
-            ],
-        )
-    )
-
-    # ===============================================================================
-    # Compare max performances and baseline
-    from matplotlib.backends.backend_pdf import PdfPages
-
-    plot_file_path = os.path.join(folder, "evaluation_by_overall_refit.pdf")
-    if all([x is not None for x in [X_test, X_mnk_test, Y_test]]):
-        plot_file_path = os.path.join(folder, "evaluation_overall.pdf")
-    pp = PdfPages(plot_file_path)
-
-    if all([x is not None for x in [X_test, X_mnk_test, Y_test]]):
-        plot_performance_gains(
-            max_performances_algo,
-            baseline_performances_algo,
-            "trained",
-            "max. performance per algorithm",
-            "baseline per algorithm",
-            pp,
-        )
-        plot_scaled_performance_gains(
-            max_performances_algo_scaled,
-            baseline_performances_algo_scaled,
-            "trained",
-            "max. performance per algorithm",
-            "baseline per algorithm",
-            pp,
-        )
-
-    # ===============================================================================
-    # 'Results' = y_true ( y_chosen )
-    if all([x is not None for x in [X_train, X_mnk_train, Y_train]]):
-        (
-            predictive_model_perf_train,
-            predictive_model_perf_train_scaled,
-        ) = get_predive_model_performances(
-            Y_train,
-            y_train_pred,
-            X_mnk_train,
-            max_performances_ref,
-            max_performances_algo,
-        )
-
-        if all([x is not None for x in [X_test, X_mnk_test, Y_test]]):
-            (
-                predictive_model_perf_test,
-                predictive_model_perf_test_scaled,
-            ) = get_predive_model_performances(
-                Y_test,
-                y_test_pred,
-                X_mnk_test,
-                max_performances_ref,
-                max_performances_algo,
-            )
-
-    # ===============================================================================
-    # Plot results (training set: predictive modelling VS naïve)
-    log += print_and_log("\nPredictive model VS baseline: ")
-
-    if all([x is not None for x in [X_train, X_mnk_train, Y_train]]):
-        perf_gain_pred_train_over_baseline = performance_gain(
-            baseline_performances_algo, predictive_model_perf_train
-        )
-        plot_absolute_performance_gain(
-            perf_gain_pred_train_over_baseline,
-            "trained",
-            "baseline per algorithm",
-            "predictive model",
-            pp,
-        )
-
-        scaled_perf_gain_pred_train_over_baseline = performance_gain(
-            baseline_performances_algo_scaled, predictive_model_perf_train_scaled
-        )
-        plot_relative_performance_gain(
-            scaled_perf_gain_pred_train_over_baseline,
-            "trained",
-            "baseline per algorithm",
-            "predictive model",
-            pp,
-        )
-
-        if all([x is not None for x in [X_test, X_mnk_test, Y_test]]):
-            perf_gain_pred_test_over_baseline = performance_gain(
-                baseline_performances_algo, predictive_model_perf_test
-            )
-            plot_absolute_performance_gain(
-                perf_gain_pred_test_over_baseline,
-                "tested",
-                "baseline per algorithm",
-                "predictive model",
-                pp,
-            )
-
-            scaled_perf_gain_pred_test_over_baseline = performance_gain(
-                baseline_performances_algo_scaled, predictive_model_perf_test_scaled
-            )
-            plot_relative_performance_gain(
-                scaled_perf_gain_pred_test_over_baseline,
-                "tested",
-                "baseline per algorithm",
-                "predictive model",
-                pp,
-            )
-
-            log += print_and_log("\nPredictive model VS autotuned: ")
-            perf_gain_pred_train_over_max = performance_gain(
-                max_performances_algo, predictive_model_perf_train
-            )
-            plot_absolute_performance_gain(
-                perf_gain_pred_train_over_max,
-                "trained",
-                "max. performance per algorithm",
-                "predictive model",
-                pp,
-            )
-            scaled_perf_gain_pred_train_over_max = performance_gain(
-                max_performances_algo_scaled, predictive_model_perf_train_scaled
-            )
-            plot_relative_performance_gain(
-                scaled_perf_gain_pred_train_over_max,
-                "trained",
-                "max. performance per algorithm",
-                "predictive model",
-                pp,
-            )
-
-        if all([x is not None for x in [X_test, X_mnk_test, Y_test]]):
-            perf_gain_pred_test_over_max = performance_gain(
-                max_performances_algo, predictive_model_perf_test
-            )
-            plot_absolute_performance_gain(
-                perf_gain_pred_test_over_max,
-                "tested",
-                "max. performance per algorithm",
-                "predictive model",
-                pp,
-            )
-            scaled_perf_gain_pred_test_over_max = performance_gain(
-                max_performances_algo_scaled, predictive_model_perf_test_scaled
-            )
-            plot_relative_performance_gain(
-                scaled_perf_gain_pred_test_over_max,
-                "tested",
-                "max. performance per algorithm",
-                "predictive model",
-                pp,
-            )
-
-        if all([x is not None for x in [X_test, X_mnk_test, Y_test]]):
-            log += print_and_log("\nCompare performances: ")
-            plot_performance_gains(
-                baseline_performances_algo,
-                predictive_model_perf_train,
-                "trained",
-                "baseline per algorithm",
-                "predictive model",
-                pp,
-            )
-            plot_performance_gains(
-                max_performances_algo,
-                predictive_model_perf_train,
-                "trained",
-                "max. performance per algorithm",
-                "predictive model",
-                pp,
-            )
-
-        if all([x is not None for x in [X_test, X_mnk_test, Y_test]]):
-            plot_performance_gains(
-                baseline_performances_algo,
-                predictive_model_perf_test,
-                "tested",
-                "baseline per algorithm",
-                "predictive model",
-                pp,
-            )
-            plot_performance_gains(
-                max_performances_algo,
-                predictive_model_perf_test,
-                "tested",
-                "max. performance per algorithm",
-                "predictive model",
-                pp,
-            )
-
-        pp.close()
-
-    return log
-
-
-# ===============================================================================
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="""
-        Train a Machine Learning model on autotuning data to predict a kernel's performance given
-        its template parameters
-
-
-        This script is part of the workflow for predictive modelling of optimal libsmm_acc parameters.
-        For more details, see README.md.
-        """,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "-d",
-        "--destination_folder",
-        metavar="FOLDER",
-        type=str,
-        default=".",
-        help="Folder in which to write plots, models, etc.",
-    )
-    parser.add_argument(
-        "-f",
-        "--folder",
-        metavar="FOLDER",
-        type=str,
-        default=".",
-        help="Folder from which to read data",
-    )
-    parser.add_argument(
-        "-a", "--algo", metavar="algoname", default="", help="Algorithm to train on"
-    )
-    parser.add_argument(
-        "-m",
-        "--model",
-        default="DT",
-        help="Model to train. Options: DT (Decision Trees), RF (Random Forests), xgb-DT, xgb-DT-dask (out-of-memory"
-        + "xgboost), xgb-DT-GPU (with GPU support), xgb-RF",
-    )
-    parser.add_argument(
-        "-o",
-        "--hyperparameter_optimization",
-        default=False,
-        help="Whether to do hyperparameter optimization. If False, the model will be trained with 'best guess' parameters",
-    )
-    parser.add_argument(
-        "-s",
-        "--splits",
-        default=3,
-        metavar="NUMBER",
-        type=int,
-        help="Number of cross-validation splits used in RFECV and GridSearchCV",
-    )
-    parser.add_argument(
-        "-e",
-        "--ntrees",
-        default=3,
-        metavar="NUMBER",
-        type=int,
-        help="Number of estimators in RF",
-    )
-    parser.add_argument(
-        "-j",
-        "--njobs",
-        default=-1,
-        metavar="NUMBER",
-        type=int,
-        help="Number of parallel jobs that Joblib will launch (used by GridSearchCV and XGBoost)",
-    )
-    parser.add_argument(
-        "-r",
-        "--nrows",
-        default=None,
-        metavar="NUMBER",
-        type=int,
-        help="Number of rows of data to load. Default: None (load all)",
-    )
-    parser.add_argument(
-        "-g",
-        "--prefitted_model",
-        metavar="filename",
-        default="",
-        help="Path to pickled model object to load instead of re-training model",
-    )
-    parser.add_argument(
-        "-i",
-        "--intermediate_evaluation",
-        default=False,
-        help="Whether to perform evaluation of the model trained on part of the model",
-    )
-    parser.set_defaults(intermediate_evaluation=False)
-
-    args = parser.parse_args()
-    model_args = {
-        "model": args.model,
-        "splits": args.splits,
-        "ntrees": args.ntrees,
-        "njobs": args.njobs,
-        "hyperparameter_optimization": args.hyperparameter_optimization,
-    }
-    main(
-        args.folder,
-        args.destination_folder,
-        args.algo,
-        model_args,
-        args.nrows,
-        args.prefitted_model,
-        args.intermediate_evaluation,
-    )
diff --git a/src/acc/libsmm_acc/predict/prepare_training_data.py b/src/acc/libsmm_acc/predict/prepare_training_data.py
deleted file mode 100755
index d8240d9e2d4..00000000000
--- a/src/acc/libsmm_acc/predict/prepare_training_data.py
+++ /dev/null
@@ -1,832 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-####################################################################################################
-# Copyright (C) by the DBCSR developers group - All rights reserved                                #
-# This file is part of the DBCSR library.                                                          #
-#                                                                                                  #
-# For information on the license, see the LICENSE file.                                            #
-# For further information please visit https://dbcsr.cp2k.org                                      #
-# SPDX-License-Identifier: GPL-2.0+                                                                #
-####################################################################################################
-
-
-import sys
-import os
-import json
-import argparse
-import numpy as np
-import pandas as pd
-import dask.dataframe as dd
-from joblib import Parallel, delayed
-from tqdm import tqdm
-
-sys.path.append("../")
-
-from kernels.smm_acc import kernel_algorithm, mnk_pattern  # noqa: E402
-from kernels.smm_acc_predict import (  # noqa: E402
-    PredictiveParameters,
-    derived_parameters,
-)
-
-
-# ===============================================================================
-# HELPER FUNCTIONS
-# ===============================================================================
-def update_maximums(dictionnary_to_update, dictionnary_partial):
-    for mnk, new_perf in dictionnary_partial.items():
-        if mnk in dictionnary_to_update.keys():
-            if new_perf > dictionnary_to_update[mnk]:
-                dictionnary_to_update[mnk] = new_perf
-        else:
-            dictionnary_to_update[mnk] = new_perf
-    return dictionnary_to_update
-
-
-# ===============================================================================
-def get_idx_baseline(data_mnk, algorithm, baseline_pars):
-    if algorithm in ["tiny"]:
-        idx_baseline = data_mnk[
-            (data_mnk.m == baseline_pars["m"])
-            & (data_mnk.n == baseline_pars["n"])
-            & (data_mnk.k == baseline_pars["k"])
-            & (data_mnk.threads == baseline_pars["threads"])
-            & (data_mnk.grouping == baseline_pars["grouping"])
-            & (data_mnk.minblocks == baseline_pars["minblocks"])
-        ].index.tolist()
-    elif algorithm in ["small", "medium"]:
-        idx_baseline = data_mnk[
-            (data_mnk.m == baseline_pars["m"])
-            & (data_mnk.n == baseline_pars["n"])
-            & (data_mnk.k == baseline_pars["k"])
-            & (data_mnk.threads == baseline_pars["threads"])
-            & (data_mnk.grouping == baseline_pars["grouping"])
-            & (data_mnk.minblocks == baseline_pars["minblocks"])
-            & (data_mnk.tile_m == baseline_pars["tile_m"])
-            & (data_mnk.tile_n == baseline_pars["tile_n"])
-        ].index.tolist()
-    else:  # algorithm is largeDB1 or largeDB2
-        idx_baseline = data_mnk[
-            (data_mnk.m == baseline_pars["m"])
-            & (data_mnk.n == baseline_pars["n"])
-            & (data_mnk.k == baseline_pars["k"])
-            & (data_mnk.threads == baseline_pars["threads"])
-            & (data_mnk.minblocks == baseline_pars["minblocks"])
-            & (data_mnk.tile_m == baseline_pars["tile_m"])
-            & (data_mnk.tile_n == baseline_pars["tile_n"])
-            & (data_mnk.w == baseline_pars["w"])
-            & (data_mnk.v == baseline_pars["v"])
-        ].index.tolist()
-    return idx_baseline
-
-
-def get_performance_closest_to_baseline(
-    data, algorithm, mnk, gpu_properties, autotuning_properties
-):
-    """
-    Sometimes, the so-called "baseline" parameter set does not appear in the training data.
-    This function finds the performance of the parameter set from the training data whose parameters are closest to those of the
-    baseline parameter sets.
-    """
-    m, n, k = mnk_pattern.match(mnk).groups()
-    m, n, k = int(m), int(n), int(k)
-
-    data_mnk = data[(data["m"] == m) & (data["n"] == n) & (data["k"] == k)].compute()
-    baseline_pars = kernel_algorithm[algorithm].baseline(
-        m, n, k, gpu_properties, autotuning_properties
-    )
-
-    # Get performance of baseline parameters for this algorithm & this mnk:
-    idx_baseline = get_idx_baseline(data_mnk, algorithm, baseline_pars)
-
-    # Get performance of baseline parameters for this algorithm & this mnk:
-    if len(idx_baseline) == 0:
-        # Generate space of possibilities
-        pars_sets = kernel_algorithm[algorithm].promising_parameters(
-            m, n, k, gpu_properties, autotuning_properties
-        )
-        # Sort space by distance to baseline set
-        pars_sets.sort(
-            key=lambda x: kernel_algorithm[algorithm].parameter_set_distance(
-                x, baseline_pars
-            )
-        )
-
-        for pars_set in pars_sets:
-            idx_baseline = get_idx_baseline(data_mnk, algorithm, pars_set)
-            if len(idx_baseline) > 0:
-                break
-        else:
-            raise AssertionError(
-                f'Could not find closest baseline for mnk=({m}x{n}x{k}) and for algorithm "{algorithm}.\n'
-                f"Last baseline parameters searched:\n{baseline_pars}\n"
-                f"Parameter sets searched:\n"
-            )
-
-    idx_baseline = idx_baseline[0]
-    baseline_perf = data_mnk["perf (Gflop/s)"][idx_baseline]
-    return round(baseline_perf, 3)
-
-
-def process_chunk(data_chunk, algorithm, gpu_properties, autotuning_properties):
-    """
-    Given a chunk of data, compute the baseline and maximum performance of the (m, n, k)-triplets featured in the chunk of data.
-    """
-    # Add "mnk" column
-    data_chunk["mnk"] = (
-        data_chunk["m"].astype(str)
-        + "x"
-        + data_chunk["n"].astype(str)
-        + "x"
-        + data_chunk["k"].astype(str)
-    )
-    # Get mnks
-    mnks = data_chunk["mnk"].unique()
-
-    # For each (mnk), ...
-    baseline_performances = dict()
-    max_performances = dict()
-    for mnk in mnks:
-        data_mnk = data_chunk[data_chunk["mnk"] == mnk]
-        m, n, k = mnk_pattern.match(mnk).groups()
-        m, n, k = int(m), int(n), int(k)
-
-        # Get baseline configuration for this algorithm & this mnk:
-        baseline_pars = kernel_algorithm[algorithm].baseline(
-            m, n, k, gpu_properties, autotuning_properties
-        )
-
-        # Get performance of baseline parameters for this algorithm & this mnk:
-        idx_baseline = get_idx_baseline(data_mnk, algorithm, baseline_pars)
-        if len(idx_baseline) < 1:
-            baseline_perf = 0
-        else:
-            idx_baseline = idx_baseline[0]
-            baseline_perf = data_mnk["perf (Gflop/s)"][idx_baseline]
-
-        baseline_performances[mnk] = round(baseline_perf, 3)
-
-        # Get max performance for this algorithm & this mnk
-        max_perf = data_mnk["perf (Gflop/s)"].max()
-        max_performances[mnk] = round(max_perf, 3)
-
-    return baseline_performances, max_performances
-
-
-# ===============================================================================
-def write_to_parquet(data_path, algorithm):
-    """
-    Compress CSV files to parquet
-    """
-    # Check whether the files corresponding to this algorithm have been compressed to parquet already
-    parquet_file = os.path.join(data_path, "training_data_" + algorithm + ".parquet")
-    parquet_file_done = os.path.join(
-        data_path, "training_data_" + algorithm + ".parquet.done"
-    )
-    print(
-        "\n\n------------------------------------------------------------------------"
-    )
-    if os.path.exists(parquet_file_done):
-        print("Found {:40}, skipping".format(parquet_file_done))
-
-    else:
-        print("Didn't find {:40}, generating".format(parquet_file_done))
-
-        # [RAW] Read CSV files into Pandas dataframes
-        data_file_raw = os.path.join(
-            data_path, "raw_training_data_" + algorithm + ".csv"
-        )
-        print("\nRead raw data from: {}".format(data_file_raw))
-        data_raw = dd.read_csv(data_file_raw)
-        raw_data_nrows = len(data_raw)
-        #  n_partitions should be > 1 !
-        n_partitions = max(1, int(raw_data_nrows // 1e5))
-        data_raw = data_raw.repartition(npartitions=n_partitions)
-        data_raw = data_raw.reset_index().set_index("index")
-        data_raw["idx"] = 1
-        data_raw["idx"] = data_raw.idx.cumsum()
-        data_raw = data_raw.set_index("idx", sorted=True)
-        print("Raw data head:\n", data_raw.head())
-
-        # [DERIVED] Read CSV files into Pandas dataframes
-        data_file_derived = os.path.join(
-            data_path, "training_data_" + algorithm + ".csv"
-        )
-        print("\nRead derived data from: {}".format(data_file_derived))
-        data_derived = dd.read_csv(data_file_derived)
-        derived_data_nrows = len(data_derived)
-        data_derived = data_derived.repartition(npartitions=n_partitions)
-        data_derived = data_derived.reset_index().set_index("index")
-        data_derived["idx"] = 1
-        data_derived["idx"] = data_derived.idx.cumsum()
-        data_derived = data_derived.set_index("idx", sorted=True)
-        print("Derived data head:\n", data_derived.head())
-
-        # Merge raw/derived data together
-        print("Merging raw and derived ...")
-        data = dd.merge(data_raw, data_derived, left_index=True, right_index=True)
-
-        len_data, len_data_raw, len_data_derived = (
-            len(data),
-            raw_data_nrows,
-            derived_data_nrows,
-        )
-        nrows_message_temp = """
-        Data 1     : {:15,},
-        Data 2     : {:15,},
-        Merged data: {:15,}"""
-        nrows_message = nrows_message_temp.format(
-            len_data_raw, len_data_derived, len_data
-        )
-        assert len_data == len_data_raw, "Mismatch in number of rows\n" + nrows_message
-        assert len_data == len_data_derived, (
-            "Mismatch in number of rows\n" + nrows_message
-        )
-
-        # Add "mnk" column
-        data["mnk"] = (
-            data["m"].astype(str)
-            + "x"
-            + data["n"].astype(str)
-            + "x"
-            + data["k"].astype(str)
-        )
-
-        # Print info on merged dataset
-        print("\nMerged data head:", data.head())
-        data_nrows = len(data)
-        nrows_message = """
-Data        : {:15,},
-Raw data    : {:15,},
-Derived data: {:15,}""".format(
-            data_nrows, raw_data_nrows, derived_data_nrows
-        )
-        assert data_nrows == raw_data_nrows, (
-            "Mismatch in number of rows\n" + nrows_message
-        )
-        assert data_nrows == derived_data_nrows, (
-            "Mismatch in number of rows\n" + nrows_message
-        )
-        print(nrows_message)
-
-        # Compress files to Parquet
-        print("Compress and write to {}".format(parquet_file))
-        data.to_parquet(parquet_file, engine="fastparquet", compression="snappy")
-        open(
-            parquet_file_done, "w"
-        ).close()  # touch a file to mark that parquet is done
-
-
-# ===============================================================================
-def get_non_null(nlist):
-    """
-    Given a list of numbers, return its first positive element, if it exists, zero otherwise.
-    """
-    for e in nlist:
-        if e > 0:
-            return e
-    return 0
-
-
-def get_max(nlist):
-    """
-    Return the largest element of a list of numbers
-    """
-    return np.array(nlist).max()
-
-
-def list_of_dics_to_dic_of_lists(list_of_dics):
-    """
-    Given a list "list_of_dics" of dictionaries "d", with keys "k" and values "v",
-    construct a dictionary with keys "k" and values which are lists "[v1, v2, ...]"
-    of the values corresponding to "k" in the various dictionaries "d"
-    """
-    dic_of_lists = dict()
-    for dic in list_of_dics:
-        for k, v in dic.items():
-            if k not in dic_of_lists.keys():
-                dic_of_lists[k] = list()
-            dic_of_lists[k].append(v)
-    return dic_of_lists
-
-
-def dic_of_dics_to_dic_of_lists(dic_of_dics):
-    dic_of_lists = dict()
-    for _, dic in dic_of_dics.items():
-        for k, v in dic.items():
-            if k not in dic_of_lists.keys():
-                dic_of_lists[k] = list()
-            dic_of_lists[k].append(v)
-    return dic_of_lists
-
-
-def write_baseline_and_max_records_per_algorithm(
-    data_path, algorithm, arch, n_jobs, chunk_size
-):
-    """
-    Write records of baseline performances and maximum performances for the training mnks.
-    This function reads from the raw data file (`raw_training_data_ALGORITHM.csv`)
-    Writes to JSON files.
-    """
-    # Read GPU properties and autotuning properties
-    with open("../kernels/gpu_properties.json") as f:
-        gpu_properties = json.load(f)[arch]
-    with open("../kernels/autotuning_properties.json") as f:
-        autotuning_properties = json.load(f)
-
-    # Check whether record of baseline exists
-    baseline_performances_per_algo_file = os.path.join(
-        data_path, "baseline_performances_" + algorithm + ".json"
-    )
-    max_performances_per_algo_file = os.path.join(
-        data_path, "max_performances_" + algorithm + ".json"
-    )
-    print(
-        "\n\n------------------------------------------------------------------------"
-    )
-    if os.path.exists(baseline_performances_per_algo_file) and os.path.exists(
-        max_performances_per_algo_file
-    ):
-        print("Found {:40}, skipping".format(baseline_performances_per_algo_file))
-        print("Found {:40}, skipping".format(max_performances_per_algo_file))
-
-    else:
-        print("Processing data of algorithm {}".format(algorithm))
-        raw_pars_cols = kernel_algorithm[algorithm].launch_parameters
-        if algorithm in ["largeDB1", "largeDB2"]:
-            raw_pars_cols.remove("grouping")
-
-        data_file_raw = os.path.join(
-            data_path, "raw_training_data_" + algorithm + ".csv"
-        )
-        baseline_and_maximums_performance_dictionaries = Parallel(
-            n_jobs=n_jobs, verbose=1
-        )(
-            delayed(process_chunk, check_pickle=True)(
-                data_chunk, algorithm, gpu_properties, autotuning_properties
-            )
-            for data_chunk in tqdm(
-                pd.read_csv(data_file_raw, chunksize=chunk_size), disable=True
-            )
-        )
-
-        baseline_performance_dictionaries, maximums_performance_dictionaries = zip(
-            *baseline_and_maximums_performance_dictionaries
-        )
-        baseline_performance_dictionary = list_of_dics_to_dic_of_lists(
-            baseline_performance_dictionaries
-        )
-        assert (
-            0 not in baseline_performance_dictionary.values()
-        ), "Found a max. performance of 0"
-        maximums_performance_dictionary = list_of_dics_to_dic_of_lists(
-            maximums_performance_dictionaries
-        )
-        assert (
-            0 not in maximums_performance_dictionary.values()
-        ), "Found a baseline performance of 0"
-
-        # Write max performances to files
-        max_performances = dict()
-        print("\nComputing maximum performances ...")
-        for mnk, max_list in maximums_performance_dictionary.items():
-            perf = get_max(max_list)
-            max_performances[mnk] = perf
-        with open(max_performances_per_algo_file, "w") as f:
-            json.dump(max_performances, f, indent="\t", sort_keys=True)
-        print("Wrote maximum performances to:\n", max_performances_per_algo_file)
-
-        # Write baseline performances to files
-        baseline_performances = dict()
-
-        def get_baseline_performance(mnk, base_list, raw_pars_cols):
-            perf = get_non_null(base_list)
-            if perf == 0:
-                data_file = os.path.join(
-                    data_path, "raw_training_data_" + algorithm + ".csv"
-                )
-                data = dd.read_csv(data_file)
-                perf = get_performance_closest_to_baseline(
-                    data, algorithm, mnk, gpu_properties, autotuning_properties
-                )
-            return perf
-
-        print("\nComputing baseline performances ...")
-        baseline_performances_ = Parallel(n_jobs=n_jobs, verbose=1)(
-            delayed(get_baseline_performance, check_pickle=True)(
-                mnk, base_list, raw_pars_cols
-            )
-            for mnk, base_list in tqdm(
-                baseline_performance_dictionary.items(), disable=True
-            )
-        )
-
-        baseline_performances = dict(
-            zip(baseline_performance_dictionary.keys(), baseline_performances_)
-        )
-        with open(baseline_performances_per_algo_file, "w") as f:
-            json.dump(baseline_performances, f, indent="\t", sort_keys=True)
-        print("Wrote baseline performances to:\n", baseline_performances_per_algo_file)
-
-
-# ===============================================================================
-def plot_baseline(baseline_perfs_by_algo, data_path, algorithms):
-    import re
-    import matplotlib.pyplot as plt
-
-    print("\nPlotting baseline performances ...")
-
-    # Get all mnks
-    mnk_sequences = list()
-    for _algo, baseline_dic in baseline_perfs_by_algo.items():
-        mnk_sequences += list(baseline_dic.keys())
-    all_mnks = list(set.union(set(mnk_sequences)))
-
-    # Reduce baseline_perfs_by_algo to baseline_perfs
-    baseline_perfs = dict()
-    for mnk in all_mnks:
-        for algo in [
-            "medium",
-            "small",
-            "largeDB1",
-            "largeDB2",
-            "tiny",
-        ]:  # algorithms in order of baseline-ness
-            if mnk in baseline_perfs_by_algo[algo].keys():
-                baseline_perfs[mnk] = baseline_perfs_by_algo[algo][mnk]
-                break
-        else:
-            raise AssertionError(
-                "NOOOO this is actually impossible by def of all_mnks, isn't it?"
-            )
-
-    # Sort
-    mnks = list()
-    mnk_str = re.compile(r"(\d+)x(\d+)x(\d+)")
-    for mnk_s in baseline_perfs.keys():
-        match = mnk_str.match(mnk_s)
-        mnks.append((int(match.group(1)), int(match.group(2)), int(match.group(3))))
-
-    baseline_performances = zip(mnks, baseline_perfs.values())
-
-    baseline_performances_sorted = [
-        (mnk[0] * mnk[1] * mnk[2], p)
-        for mnk, p in sorted(
-            baseline_performances, key=lambda x: x[0][0] * x[0][1] * x[0][2]
-        )
-    ]
-    mnk_sorted, baseline_perf_sorted = list(zip(*baseline_performances_sorted))
-
-    # Plot
-    plt.plot(mnk_sorted, baseline_perf_sorted, ".", markersize=1)
-    plt.xlabel("(m, n, k) triplets of training data (in order of increasing m*n*k)")
-    plt.ylabel("Baseline performances (Gflop/s)")
-    plt.title("Baseline performances on training data")
-    algorithm_extension = "_" + algorithms[0] if len(algorithms) == 0 else ""
-    file_name = os.path.join(
-        data_path, "baseline_performances" + algorithm_extension + ".svg"
-    )
-    plt.savefig(file_name)
-    print("... wrote to", file_name)
-    plt.close()
-
-
-def write_baseline_record(data_path, algorithms):
-    baseline_performances_by_algo_file = os.path.join(
-        data_path, "baseline_performances_by_algo.json"
-    )
-    if os.path.exists(baseline_performances_by_algo_file):
-        print("Found {:40}, skipping".format(baseline_performances_by_algo_file))
-        with open(baseline_performances_by_algo_file) as f:
-            baseline_performances_by_algo = json.load(f)
-
-    else:
-        print(
-            "File {:40} not found, generating".format(
-                baseline_performances_by_algo_file
-            )
-        )
-        # Get baseline performances by algorithm
-        baseline_performances_by_algo = dict()
-        for algorithm in algorithms:
-            # Read baseline parameters
-            baseline_performances_per_algo_file = os.path.join(
-                data_path, "baseline_performances_" + algorithm + ".json"
-            )
-            with open(baseline_performances_per_algo_file, "r") as f:
-                baseline_algorithm = json.load(f)
-            # Add to dictionary
-            baseline_performances_by_algo[algorithm] = baseline_algorithm
-
-        # Write to file
-        with open(baseline_performances_by_algo_file, "w") as f:
-            json.dump(baseline_performances_by_algo, f, indent="\t", sort_keys=True)
-        print("\nWrote baseline performances to:\n", baseline_performances_by_algo_file)
-
-    plot_baseline(baseline_performances_by_algo, data_path, algorithms)
-
-
-def write_max_by_algo_record(data_path, algorithms):
-    max_performances_by_algo_file = os.path.join(
-        data_path, "max_performances_by_algo.json"
-    )
-    if os.path.exists(max_performances_by_algo_file):
-        print("Found {:40}, skipping".format(max_performances_by_algo_file))
-
-    else:
-        # Get max performances by algorithm
-        max_performances_by_algo = dict()
-        for algorithm in algorithms:
-            # Read max parameters
-            max_performances_per_algo_file = os.path.join(
-                data_path, "max_performances_" + algorithm + ".json"
-            )
-            with open(max_performances_per_algo_file, "r") as f:
-                max_algorithm = json.load(f)
-            # Add to dictionary
-            max_performances_by_algo[algorithm] = max_algorithm
-
-        # Write to file
-        with open(max_performances_by_algo_file, "w") as f:
-            json.dump(max_performances_by_algo, f, indent="\t", sort_keys=True)
-        print(
-            "\nWrote max performances by algorithm to:\n", max_performances_by_algo_file
-        )
-
-
-def plot_max_performances(max_perfs, data_path, algorithms):
-    import re
-    import matplotlib.pyplot as plt
-
-    print("\nPlotting max. performances ...")
-
-    mnks = list()
-    mnk_str = re.compile(r"(\d+)x(\d+)x(\d+)")
-    for mnk_s in max_perfs.keys():
-        match = mnk_str.match(mnk_s)
-        mnks.append((int(match.group(1)), int(match.group(2)), int(match.group(3))))
-
-    max_performances = zip(mnks, max_perfs.values())
-    max_performances_sorted = [
-        (mnk[0] * mnk[1] * mnk[2], p)
-        for mnk, p in sorted(
-            max_performances, key=lambda x: x[0][0] * x[0][1] * x[0][2]
-        )
-    ]
-    mnk_sorted, max_perf_sorted = list(zip(*max_performances_sorted))
-
-    # Plot
-    plt.plot(mnk_sorted, max_performances_sorted, ".", markersize=1)
-    plt.xlabel("(m, n, k) triplets of training data (in order of increasing m*n*k)")
-    plt.ylabel("Max. performances (Gflop/s)")
-    plt.title("Maximum performances on training data")
-    algorithm_extension = "_" + algorithms[0] if len(algorithms) == 0 else ""
-    file_name = os.path.join(
-        data_path, "max_performances" + algorithm_extension + ".svg"
-    )
-    plt.savefig(file_name)
-    print("... wrote to", file_name)
-    plt.close()
-
-
-def write_max_record(data_path, algorithms):
-    max_performances_file = os.path.join(data_path, "max_performances.json")
-    if os.path.exists(max_performances_file):
-        print("Found {:40}, skipping".format(max_performances_file))
-        with open(max_performances_file) as f:
-            max_performances = json.load(f)
-
-    else:
-        # Get max performances
-        max_performances_by_algo = dict()
-        for algorithm in algorithms:
-            # Read max parameters
-            max_performances_per_algo_file = os.path.join(
-                data_path, "max_performances_" + algorithm + ".json"
-            )
-            with open(max_performances_per_algo_file, "r") as f:
-                max_algorithm = json.load(f)
-            # Add to dictionary
-            max_performances_by_algo[algorithm] = max_algorithm
-
-        # Reduce along max
-        max_performances_list = dic_of_dics_to_dic_of_lists(max_performances_by_algo)
-        max_performances = dict()
-        for mnk, max_list in max_performances_list.items():
-            max_performances[mnk] = get_max(max_list)
-
-        # Write to file
-        with open(max_performances_file, "w") as f:
-            json.dump(max_performances, f, indent="\t", sort_keys=True)
-        print("\nWrote max performances to:\n", max_performances_file)
-
-    plot_max_performances(max_performances, data_path, algorithms)
-
-
-def get_derived_pars(
-    data_path,
-    i,
-    data_chunk,
-    algorithm,
-    gpu_properties,
-    autotuning_properties,
-    max_performances,
-):
-    # Compute derived parameters
-    data_chunk["algorithm"] = [algorithm] * len(
-        data_chunk.index
-    )  # add 'algorithm' column manually
-    parameter_sets = PredictiveParameters(
-        data_chunk, gpu_properties, autotuning_properties, max_performances
-    )
-    pars_to_get = derived_parameters["common"] + derived_parameters[algorithm]
-    new_data = parameter_sets.get_features(pars_to_get)
-
-    # Write to CSV
-    filename = os.path.join(data_path, "training_data_{}-{}.csv".format(algorithm, i))
-    new_data.to_csv(filename, index=False)
-
-    return filename
-
-
-def write_derived_data(data_path, algorithm, arch, n_jobs, chunk_size):
-    """
-    The predictive modelling procedure uses not only the raw parameters as features, but also some
-    "derived" features computed using algorithm characteristics and hardware knowledge.
-    This function reads raw parameters from `data_path`, computes derived parameters and writes them
-    to the same folder.
-    """
-    derived_training_data_filename = os.path.join(
-        data_path, "training_data_{}.csv".format(algorithm)
-    )
-    print(
-        "\n\n------------------------------------------------------------------------"
-    )
-    if os.path.exists(derived_training_data_filename):
-        print("Found {:40}, skipping".format(derived_training_data_filename))
-
-    else:
-        print("Didn't find {:40}, generating".format(derived_training_data_filename))
-
-        # Read max performances, GPU properties and autotuning properties
-        maxperf_file = os.path.join(data_path, "max_performances.json")
-        with open(maxperf_file) as f:
-            max_performances = json.load(f)
-        with open("../kernels/gpu_properties.json") as f:
-            gpu_properties = json.load(f)[arch]
-        with open("../kernels/autotuning_properties.json") as f:
-            autotuning_properties = json.load(f)
-
-        # Compute derived data from raw data
-        raw_training_data_filename = os.path.join(
-            data_path, "raw_training_data_{}.csv".format(algorithm)
-        )
-        print(
-            "reading raw data from {} and computing derived parameters".format(
-                raw_training_data_filename
-            )
-        )
-
-        derived_training_data_filenames = Parallel(n_jobs=n_jobs, verbose=1)(
-            delayed(get_derived_pars, check_pickle=True)(
-                data_path,
-                i,
-                data_chunk,
-                algorithm,
-                gpu_properties,
-                autotuning_properties,
-                max_performances,
-            )
-            for i, data_chunk in enumerate(
-                pd.read_csv(raw_training_data_filename, chunksize=chunk_size)
-            )
-        )
-
-        # Merge the CSV files (one for each iteration of the above Joblib loop) into one file
-        assert len(derived_training_data_filenames) > 0, "No training data files"
-        if len(derived_training_data_filenames) == 1:
-            # No merging is necessary. Simply rename the file
-            os.rename(
-                derived_training_data_filenames[0], derived_training_data_filename
-            )
-
-        else:
-            with open(derived_training_data_filename, "w") as out:
-                # Write the first file, including its header
-                fn_1 = derived_training_data_filenames.pop(0)
-                with open(fn_1) as f:
-                    out.write(f.read())
-                os.remove(fn_1)
-                # Write the rest of the files, skipping the header line each time
-                for i, fn in enumerate(derived_training_data_filenames):
-                    print(
-                        "writing from {} ({}/{})".format(
-                            fn, i + 1, len(derived_training_data_filenames)
-                        )
-                    )
-                    with open(fn) as f:
-                        next(f)  # skip header line
-                        out.write(f.read())
-                    # Delete the file we just merged
-                    os.remove(fn)
-
-        print("\tWrote", derived_training_data_filename)
-
-
-# ===============================================================================
-def main(data_path, algorithms_to_prep, arch, n_jobs, chunk_size, skip_derived_data):
-    # ===============================================================================
-    # Write baseline and maximum performance records
-    for algorithm in algorithms_to_prep:
-        write_baseline_and_max_records_per_algorithm(
-            data_path, algorithm, arch, n_jobs, chunk_size
-        )
-
-    if set(algorithms_to_prep) == set(kernel_algorithm.keys()):
-        write_baseline_record(data_path, algorithms_to_prep)
-        write_max_by_algo_record(data_path, algorithms_to_prep)
-        write_max_record(data_path, algorithms_to_prep)
-
-    # ===============================================================================
-    if not skip_derived_data:
-        for algorithm in algorithms_to_prep:
-            write_derived_data(data_path, algorithm, arch, n_jobs, chunk_size)
-            write_to_parquet(data_path, algorithm)
-
-
-# ===============================================================================
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="""
-        Prepare the data collected with autotuning for training,
-        After downloading raw data from the dedicated repository, use this script to
-        - Record maximum and baseline performances of (m,n,k)-triplets in JSON files
-        - Compute derived training data and write it to a CSV file
-        - Compress training data csv files to parquet file format
-
-
-        This script is part of the workflow for predictive modelling of optimal libcusmm parameters.
-        For more details, see predict.md
-        """,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "-f",
-        "--folder",
-        metavar="FOLDER",
-        type=str,
-        default=".",
-        help="Path to the data to be converted to parquet.",
-    )
-    parser.add_argument(
-        "-l",
-        "--algorithm",
-        metavar="ALGORITHM",
-        default="",
-        help="Algorithms to prepare",
-    )
-    parser.add_argument(
-        "-a",
-        "--arch",
-        metavar="ARCHITECTURE",
-        type=str,
-        default="sm_80",
-        help="CUDA architecture number. Options: sm_35, sm_37, sm_60, sm_70, sm_80, gfx906",
-    )
-    parser.add_argument(
-        "-j",
-        "--njobs",
-        default=-1,
-        metavar="NUMBER",
-        type=int,
-        help="Number of parallel jobs that Joblib will launch. If you run into out-of-memory errors, reduce this.",
-    )
-    parser.add_argument(
-        "-c",
-        "--chunk_size",
-        type=int,
-        default=20000,
-        help="Chunk size for dispatching joblib jobs. If memory errors are experienced, reduce this number",
-    )
-    parser.add_argument(
-        "-s",
-        "--skip_derived_data",
-        type=bool,
-        default=False,
-        help=(
-            "Skip the computation of derived data. Set to true if computing baseline & max records for "
-            "each algorithm separately"
-        ),
-    )
-
-    args = parser.parse_args()
-    algorithms_to_prep = (
-        kernel_algorithm.keys() if args.algorithm == "" else [args.algorithm]
-    )
-    main(
-        args.folder,
-        algorithms_to_prep,
-        args.arch,
-        args.njobs,
-        args.chunk_size,
-        args.skip_derived_data,
-    )
diff --git a/src/acc/libsmm_acc/predict/requirements.txt b/src/acc/libsmm_acc/predict/requirements.txt
deleted file mode 100644
index a9187ccbc03..00000000000
--- a/src/acc/libsmm_acc/predict/requirements.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-eli5>=0.8.1
-joblib>=0.13.1, <=0.17.0
-tqdm>=4.32.2
-matplotlib>=3.0.2
-numpy>=1.16.0
-pandas>=0.23.4
-scikit-learn>=0.20.2
-dask[dataframe]>=2021.10.0
-xgboost>=0.90
-fastparquet>=0.3.1
-python-snappy>=0.5.4
diff --git a/src/acc/libsmm_acc/tune/README.md b/src/acc/libsmm_acc/tune/README.md
index 01b00710f83..96c8571e12e 100644
--- a/src/acc/libsmm_acc/tune/README.md
+++ b/src/acc/libsmm_acc/tune/README.md
@@ -65,7 +65,6 @@ The `tune_setup.py` script generates job files. You have to adapt the script to
     output += "date\n"
 
     ...
-
 ...
 ```
 
@@ -235,11 +234,7 @@ Wrote parameters.new.json
 
 The file `parameters.new.json` can now be used as a parameter file. Rename it to `parameters_GPU.json`, with the appropriate `GPU`.
 
-#### 8. (optional) Explore the data
-
-Explore the data interactively using the [provided Jupyter Notebook](https://github.com/cp2k/dbcsr/blob/develop/src/acc/libsmm_acc/notebooks/inspect_training_data.ipynb).
-
-#### 9. Contribute parameters to the community
+#### 8. Contribute parameters to the community
 
 **Contribute new optimal parameters**
 
diff --git a/src/acc/opencl/Makefile b/src/acc/opencl/Makefile
index 0cbefd97d69..244a7b2692f 100644
--- a/src/acc/opencl/Makefile
+++ b/src/acc/opencl/Makefile
@@ -73,23 +73,15 @@ ifneq (,$(ELEM_TYPE))
   CFLAGS += -DELEM_TYPE=$(ELEM_TYPE)
 endif
 
-ifeq (1,$(INTEL))
-  CXX := icpc
-  CC := icc
-  AR := xiar
-else ifneq (0,$(INTEL))
-  CXX := icpx
-  CC := icx
-  AR := xiar
-else ifneq (0,$(GNU))
-  override CXX := g++
-  override CC := gcc
-  ifneq (Darwin,$(UNAME))
-    override AR := gcc-ar
+ifneq (0,$(INTEL))
+  ifneq (1,$(INTEL))
+    CXX := icpx
+    CC := icx
   else
-    override AR := ar
+    CXX := icpc
+    CC := icc
   endif
-  #override LD_LIBRARY_DIRS := $(NULL)
+  AR := $(if $(call which,xiar),xiar,ar)
 else
   CXX := g++
   CC := gcc
@@ -109,13 +101,14 @@ ifneq (0,$(DEV))
     CFLAGS += -D__DBCSR_ACC
     CFLAGS += -Wno-deprecated -Werror
     ifneq (2,$(DEV))
+      $(info DEBUG: $(CC) $(CXX))
       ifneq (,$(findstring clang,$(CC) $(CXX)))
         override CC := clang++ --analyze
       else
-        CC := $(CXX) -xc++
+        override CC := $(CXX) -xc++
       endif
     else
-      CC := $(CXX) -xc++
+      override CC := $(CXX) -xc++
     endif
     $(info CC: $(shell $(CC) --version | head -n1))
     OMP := 0
diff --git a/src/acc/opencl/acc_opencl.c b/src/acc/opencl/acc_opencl.c
index 6ea4f1d1f83..d7ee7651184 100644
--- a/src/acc/opencl/acc_opencl.c
+++ b/src/acc/opencl/acc_opencl.c
@@ -217,8 +217,8 @@ int c_dbcsr_acc_init(void) {
     cl_platform_id platforms[ACC_OPENCL_MAXNDEVS] = {NULL};
     cl_device_id devices[ACC_OPENCL_MAXNDEVS];
     char buffer[ACC_OPENCL_BUFFERSIZE];
+    const char *const env_devsplit = getenv("ACC_OPENCL_DEVSPLIT"), *const env_priority = getenv("ACC_OPENCL_PRIORITY");
     const char *const env_devmatch = getenv("ACC_OPENCL_DEVMATCH"), *const env_devtype = getenv("ACC_OPENCL_DEVTYPE");
-    const char *const env_priority = getenv("ACC_OPENCL_PRIORITY"), *const env_xhints = getenv("ACC_OPENCL_XHINTS");
     const char *const env_verbose = getenv("ACC_OPENCL_VERBOSE"), *const env_debug = getenv("ACC_OPENCL_DEBUG");
     const char *const env_device = getenv("ACC_OPENCL_DEVICE"), *const env_dump_acc = getenv("ACC_OPENCL_DUMP");
     const char *const env_timer = getenv("ACC_OPENCL_TIMER"), *const env_nlocks = getenv("ACC_OPENCL_NLOCKS");
@@ -229,14 +229,20 @@ int c_dbcsr_acc_init(void) {
 #  endif
     const char *const env_neo = getenv("NEOReadDebugKeys"), *const env_wa = getenv("ACC_OPENCL_WA");
     const int neo = (NULL == env_neo ? 1 : atoi(env_neo));
+#  if defined(ACC_OPENCL_XHINTS)
+    const char* const env_xhints = (ACC_OPENCL_XHINTS);
+    const int xhints_default = 1 + 2 + 4 + 8;
+#  else
+    const char* const env_xhints = NULL;
+    const int xhints_default = 0;
+#  endif
 #  if defined(ACC_OPENCL_ASYNC)
     const char* const env_async = (ACC_OPENCL_ASYNC);
-    const int async_default = 3;
+    const int async_default = 1 + 2;
 #  else
     const char* const env_async = NULL;
     const int async_default = 0;
 #  endif
-    const char* const env_devsplit = getenv("ACC_OPENCL_DEVSPLIT");
     /*const char* const env_nranks = getenv("MPI_LOCALNRANKS");
     const cl_uint nranks = LIBXSMM_MAX(NULL != env_nranks ? atoi(env_nranks) : 1, 1);*/
     const cl_int devsplit = (NULL == env_devsplit ? /*(1 < nranks ? -1 : 0)*/ 0 : atoi(env_devsplit));
@@ -274,11 +280,11 @@ int c_dbcsr_acc_init(void) {
                                                   : c_dbcsr_acc_opencl_config.lock_main);
     c_dbcsr_acc_opencl_config.verbosity = (NULL == env_verbose ? 0 : atoi(env_verbose));
     c_dbcsr_acc_opencl_config.priority = (NULL == env_priority ? /*default*/ 3 : atoi(env_priority));
-    c_dbcsr_acc_opencl_config.xhints = (NULL == env_xhints ? (1 + 2) : atoi(env_xhints));
+    c_dbcsr_acc_opencl_config.xhints = (NULL == env_xhints ? xhints_default : atoi(env_xhints));
     c_dbcsr_acc_opencl_config.async = (NULL == env_async ? async_default : atoi(env_async));
     c_dbcsr_acc_opencl_config.dump = (NULL == env_dump ? /*default*/ 0 : atoi(env_dump));
     c_dbcsr_acc_opencl_config.debug = (NULL == env_debug ? c_dbcsr_acc_opencl_config.dump : atoi(env_debug));
-    c_dbcsr_acc_opencl_config.wa = neo * (NULL == env_wa ? ((1 != devsplit ? 0 : 4) + (8 + 16) + (32 + 64)) : atoi(env_wa));
+    c_dbcsr_acc_opencl_config.wa = neo * (NULL == env_wa ? ((1 != devsplit ? 0 : 4) + (8 + 16 + 32)) : atoi(env_wa));
     assert(EXIT_SUCCESS == result);
     if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(NULL /*device*/, env_devmatch, &c_dbcsr_acc_opencl_config.devmatch)) {
       c_dbcsr_acc_opencl_config.devmatch = 1;
@@ -338,18 +344,22 @@ int c_dbcsr_acc_init(void) {
 #  if defined(ACC_OPENCL_NCCS)
     if ((1 & c_dbcsr_acc_opencl_config.wa) && 0 != nccs && NULL == getenv("ZEX_NUMBER_OF_CCS")) {
       static char zex_nccs[ACC_OPENCL_MAXNDEVS * 8 + 32] = "ZEX_NUMBER_OF_CCS=";
+      const int mode = ((1 == nccs || 2 == nccs) ? nccs : 4);
       int j = strlen(zex_nccs);
       for (i = 0; i < ACC_OPENCL_MAXNDEVS; ++i) {
         const char* const istr = (0 < i ? ",%u:%i" : "%u:%i");
-        const int n = LIBXSMM_SNPRINTF(zex_nccs + j, sizeof(zex_nccs) - j, istr, i, LIBXSMM_CLMP(nccs, 1, 4));
+        const int n = LIBXSMM_SNPRINTF(zex_nccs + j, sizeof(zex_nccs) - j, istr, i, mode);
         if (0 < n) j += n;
         else {
           j = 0;
           break;
         }
       }
-      /* environment is populated before touching the compute runtime */
-      if (0 < j) ACC_OPENCL_EXPECT(0 == LIBXSMM_PUTENV(zex_nccs)); /* soft-error */
+      if (0 < j && 0 == LIBXSMM_PUTENV(zex_nccs) && /* populate before touching the compute runtime */
+          (2 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity))
+      {
+        fprintf(stderr, "INFO ACC/OpenCL: support multiple separate compute command streamers (%i-CCS mode)\n", mode);
+      }
     }
     assert(EXIT_SUCCESS == result);
 #  endif
@@ -373,9 +383,9 @@ int c_dbcsr_acc_init(void) {
     }
     assert(EXIT_SUCCESS == result);
 #  endif
-    if (~(1 + 2) & c_dbcsr_acc_opencl_config.wa) { /* environment is populated before touching the compute runtime */
-      static char* key_value[] = {"NEOReadDebugKeys=1", "ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE", "EnableRecoverablePageFaults=0",
-        "DirectSubmissionOverrideBlitterSupport=0"};
+    if (~(1 + 2 + 32) & c_dbcsr_acc_opencl_config.wa) { /* environment is populated before touching the compute runtime */
+      static char a[] = "NEOReadDebugKeys=1", b[] = "ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE", c[] = "EnableRecoverablePageFaults=0";
+      static char d[] = "DirectSubmissionOverrideBlitterSupport=0", *key_value[] = {a, b, c, d};
       if (NULL == env_neo) ACC_OPENCL_EXPECT(0 == LIBXSMM_PUTENV(key_value[0]));
       if ((4 & c_dbcsr_acc_opencl_config.wa) && NULL == getenv("ZE_FLAT_DEVICE_HIERARCHY")) {
         ACC_OPENCL_EXPECT(0 == LIBXSMM_PUTENV(key_value[1]));
@@ -537,7 +547,7 @@ int c_dbcsr_acc_init(void) {
                 c_dbcsr_acc_opencl_config.devices[0] = c_dbcsr_acc_opencl_config.devices[i];
               }
               c_dbcsr_acc_opencl_config.ndevices = 1;
-              device_id = (int)i;
+              device_id = 0;
               break;
             }
             else if (CL_DEVICE_TYPE_ALL == type && NULL == env_devtype /*&& CL_DEVICE_TYPE_GPU == itype*/ && device_id <= (int)i) {
@@ -641,6 +651,11 @@ int c_dbcsr_acc_init(void) {
             result = c_dbcsr_acc_opencl_set_active_device(NULL /*lock*/, ACC_OPENCL_ACTIVATE);
           }
           else {
+            const char* const env_rank = (NULL != getenv("PMI_RANK") ? getenv("PMI_RANK") : getenv("OMPI_COMM_WORLD_LOCAL_RANK"));
+            const int rank = (NULL != env_rank ? atoi(env_rank) : 0);
+            if (0 < rank && 1 < c_dbcsr_acc_opencl_config.ndevices) {
+              device_id = rank % c_dbcsr_acc_opencl_config.ndevices;
+            }
             result = c_dbcsr_acc_opencl_set_active_device(NULL /*lock*/, device_id);
           }
 #  else
@@ -843,6 +858,9 @@ int c_dbcsr_acc_opencl_device_name(
   cl_device_id device, char name[], size_t name_maxlen, char platform[], size_t platform_maxlen, int cleanup) {
   int result_name = 0, result_platform = 0;
   assert(NULL != name || NULL != platform);
+  if (NULL == device && 0 < c_dbcsr_acc_opencl_config.ndevices) {
+    device = c_dbcsr_acc_opencl_config.devices[0]; /* NULL-device refers to device 0 */
+  }
   if (NULL != name && 0 != name_maxlen) {
     result_name = clGetDeviceInfo(device, CL_DEVICE_NAME, name_maxlen, name, NULL);
     if (0 != cleanup && EXIT_SUCCESS == result_name) {
@@ -996,158 +1014,162 @@ int c_dbcsr_acc_opencl_create_context(cl_device_id active_id, cl_context* contex
 
 
 int c_dbcsr_acc_opencl_set_active_device(ACC_OPENCL_LOCKTYPE* lock, int device_id) {
-  /* accessing devices is thread-safe (array is fixed after initialization) */
-  const cl_device_id active_id =
-    ((0 <= device_id && device_id < c_dbcsr_acc_opencl_config.ndevices) ? c_dbcsr_acc_opencl_config.devices[device_id] : NULL);
   int result = EXIT_SUCCESS;
   assert(c_dbcsr_acc_opencl_config.ndevices < ACC_OPENCL_MAXNDEVS);
-  if (NULL != active_id) {
-    cl_device_id context_id = NULL;
-    cl_context context = NULL;
-    if (NULL != lock) ACC_OPENCL_ACQUIRE(lock);
-    context = c_dbcsr_acc_opencl_config.device.context;
-    context_id = c_dbcsr_acc_opencl_config.device.id;
-    if (NULL != context) {
-      assert(NULL != context_id);
-      if (active_id != context_id) {
+  if (0 <= device_id && device_id < c_dbcsr_acc_opencl_config.ndevices) {
+    /* accessing devices is thread-safe (array is fixed after initialization) */
+    const cl_device_id active_id = c_dbcsr_acc_opencl_config.devices[device_id];
+    if (NULL != active_id) {
+      cl_device_id context_id = NULL;
+      cl_context context = NULL;
+      if (NULL != lock) ACC_OPENCL_ACQUIRE(lock);
+      context = c_dbcsr_acc_opencl_config.device.context;
+      context_id = c_dbcsr_acc_opencl_config.device.id;
+      if (NULL != context) {
+        assert(NULL != context_id);
+        if (active_id != context_id) {
 #  if defined(CL_VERSION_1_2)
-        ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseDevice(context_id));
+          ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseDevice(context_id));
 #  endif
-        result = clReleaseContext(context);
-        context_id = NULL;
-        context = NULL;
+          result = clReleaseContext(context);
+          context_id = NULL;
+          context = NULL;
+        }
       }
-    }
-    assert(NULL == context_id || active_id == context_id);
-    if (EXIT_SUCCESS == result && active_id != context_id) {
-      result = c_dbcsr_acc_opencl_create_context(active_id, &context);
-      assert(NULL != context || EXIT_SUCCESS != result);
-    }
-    if (EXIT_SUCCESS == result && active_id != context_id) { /* update/cache device-specific information */
-      if (NULL != c_dbcsr_acc_opencl_config.device.stream.queue) { /* release private stream */
-        ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseCommandQueue(c_dbcsr_acc_opencl_config.device.stream.queue));
+      assert(NULL == context_id || active_id == context_id);
+      if (EXIT_SUCCESS == result && active_id != context_id) {
+        result = c_dbcsr_acc_opencl_create_context(active_id, &context);
+        assert(NULL != context || EXIT_SUCCESS != result);
       }
-      memset(&c_dbcsr_acc_opencl_config.device, 0, sizeof(c_dbcsr_acc_opencl_config.device));
-      result = c_dbcsr_acc_opencl_device_level(active_id, c_dbcsr_acc_opencl_config.device.std_clevel,
-        c_dbcsr_acc_opencl_config.device.std_level, c_dbcsr_acc_opencl_config.device.std_flag,
-        &c_dbcsr_acc_opencl_config.device.type);
-      if (EXIT_SUCCESS == result) {
-        char devname[ACC_OPENCL_BUFFERSIZE] = "";
-        const char* const sgexts[] = {"cl_intel_required_subgroup_size", "cl_intel_subgroups", "cl_khr_subgroups"};
-        size_t sgsizes[16], nbytes = 0, sgmin = (size_t)-1, i;
+      if (EXIT_SUCCESS == result && active_id != context_id) { /* update/cache device-specific information */
+        if (NULL != c_dbcsr_acc_opencl_config.device.stream.queue) { /* release private stream */
+          ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseCommandQueue(c_dbcsr_acc_opencl_config.device.stream.queue));
+        }
+        memset(&c_dbcsr_acc_opencl_config.device, 0, sizeof(c_dbcsr_acc_opencl_config.device));
+        result = c_dbcsr_acc_opencl_device_level(active_id, c_dbcsr_acc_opencl_config.device.std_clevel,
+          c_dbcsr_acc_opencl_config.device.std_level, c_dbcsr_acc_opencl_config.device.std_flag,
+          &c_dbcsr_acc_opencl_config.device.type);
+        if (EXIT_SUCCESS == result) {
+          char devname[ACC_OPENCL_BUFFERSIZE] = "";
+          const char* const sgexts[] = {"cl_intel_required_subgroup_size", "cl_intel_subgroups", "cl_khr_subgroups"};
+          size_t sgsizes[16], nbytes = 0, sgmin = (size_t)-1, i;
 #  if defined(ACC_OPENCL_CMDAGR)
-        ACC_OPENCL_STREAM_PROPERTIES_TYPE properties[4] = {
-          CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0 /* terminator */
-        };
+          ACC_OPENCL_STREAM_PROPERTIES_TYPE properties[4] = {
+            CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0 /* terminator */
+          };
 #  endif
 #  if defined(ACC_OPENCL_MEM_DEVPTR)
-        cl_platform_id platform = NULL;
-        cl_bitfield bitfield = 0;
+          cl_platform_id platform = NULL;
+          cl_bitfield bitfield = 0;
 #  endif
-        c_dbcsr_acc_opencl_config.device.intel = (EXIT_SUCCESS ==
-                                                  c_dbcsr_acc_opencl_device_vendor(active_id, "intel", 0 /*use_platform_name*/));
-        c_dbcsr_acc_opencl_config.device.nv = (EXIT_SUCCESS ==
-                                               c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia", 0 /*use_platform_name*/));
+          c_dbcsr_acc_opencl_config.device.intel = (EXIT_SUCCESS ==
+                                                    c_dbcsr_acc_opencl_device_vendor(active_id, "intel", 0 /*use_platform_name*/));
+          c_dbcsr_acc_opencl_config.device.nv = (EXIT_SUCCESS ==
+                                                 c_dbcsr_acc_opencl_device_vendor(active_id, "nvidia", 0 /*use_platform_name*/));
 
-        if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_name(
-                              active_id, devname, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/, 0 /*platform_maxlen*/, /*cleanup*/ 1) ||
-            EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(active_id, devname, &c_dbcsr_acc_opencl_config.device.uid))
-        {
-          c_dbcsr_acc_opencl_config.device.uid = (cl_uint)-1;
-        }
-        if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 0 /*use_platform_name*/) ||
-            EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 1 /*use_platform_name*/))
-        {
-          c_dbcsr_acc_opencl_config.device.amd = 1;
-          if ('\0' != *devname) {
-            const char* const gfxname = LIBXSMM_STRISTR(devname, "gfx");
-            if (NULL != gfxname && 90 <= atoi(gfxname + 3)) {
-              c_dbcsr_acc_opencl_config.device.amd = 2;
+          if (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_name(active_id, devname, ACC_OPENCL_BUFFERSIZE, NULL /*platform*/,
+                                0 /*platform_maxlen*/, /*cleanup*/ 1) ||
+              EXIT_SUCCESS != c_dbcsr_acc_opencl_device_uid(active_id, devname, &c_dbcsr_acc_opencl_config.device.uid))
+          {
+            c_dbcsr_acc_opencl_config.device.uid = (cl_uint)-1;
+          }
+          if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 0 /*use_platform_name*/) ||
+              EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "amd", 1 /*use_platform_name*/))
+          {
+            c_dbcsr_acc_opencl_config.device.amd = 1;
+            if ('\0' != *devname) {
+              const char* const gfxname = LIBXSMM_STRISTR(devname, "gfx");
+              if (NULL != gfxname && 90 <= atoi(gfxname + 3)) {
+                c_dbcsr_acc_opencl_config.device.amd = 2;
+              }
             }
           }
-        }
-        if (EXIT_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool) /*cl_int*/,
-                              &c_dbcsr_acc_opencl_config.device.unified, NULL))
-        {
-          c_dbcsr_acc_opencl_config.device.unified = CL_FALSE;
-        }
-        if (EXIT_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),
-                              c_dbcsr_acc_opencl_config.device.wgsize, NULL))
-        {
-          c_dbcsr_acc_opencl_config.device.wgsize[0] = 1;
-        }
-        if (EXIT_SUCCESS != clGetDeviceInfo(active_id, 4199 /*CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE*/, sizeof(size_t),
-                              c_dbcsr_acc_opencl_config.device.wgsize + 1, NULL)) /* CL_VERSION_3_0 */
-        {
-          c_dbcsr_acc_opencl_config.device.wgsize[1] = 1;
-        }
-        assert(0 == c_dbcsr_acc_opencl_config.device.wgsize[2]);
-        if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_id, sgexts, 2) &&
-            EXIT_SUCCESS ==
-              clGetDeviceInfo(active_id, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/, sizeof(sgsizes), sgsizes, &nbytes))
-        {
-          for (i = 0; (i * sizeof(size_t)) < nbytes; ++i) {
-            const size_t sgsize = sgsizes[i];
-            if (sgsize < sgmin) sgmin = sgsize;
-            if (0 == (sgsize % c_dbcsr_acc_opencl_config.device.wgsize[1]) && c_dbcsr_acc_opencl_config.device.wgsize[2] < sgsize) {
-              if (c_dbcsr_acc_opencl_config.device.wgsize[1] < sgsize) c_dbcsr_acc_opencl_config.device.wgsize[1] = sgsize;
-              c_dbcsr_acc_opencl_config.device.wgsize[2] = sgsize;
+          if (EXIT_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(cl_bool) /*cl_int*/,
+                                &c_dbcsr_acc_opencl_config.device.unified, NULL))
+          {
+            c_dbcsr_acc_opencl_config.device.unified = CL_FALSE;
+          }
+          if (EXIT_SUCCESS != clGetDeviceInfo(active_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),
+                                c_dbcsr_acc_opencl_config.device.wgsize, NULL))
+          {
+            c_dbcsr_acc_opencl_config.device.wgsize[0] = 1;
+          }
+          if (EXIT_SUCCESS != clGetDeviceInfo(active_id, 4199 /*CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE*/, sizeof(size_t),
+                                c_dbcsr_acc_opencl_config.device.wgsize + 1, NULL)) /* CL_VERSION_3_0 */
+          {
+            c_dbcsr_acc_opencl_config.device.wgsize[1] = 1;
+          }
+          assert(0 == c_dbcsr_acc_opencl_config.device.wgsize[2]);
+          if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_id, sgexts, 2) &&
+              EXIT_SUCCESS ==
+                clGetDeviceInfo(active_id, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/, sizeof(sgsizes), sgsizes, &nbytes))
+          {
+            for (i = 0; (i * sizeof(size_t)) < nbytes; ++i) {
+              const size_t sgsize = sgsizes[i];
+              if (sgsize < sgmin) sgmin = sgsize;
+              if (0 != c_dbcsr_acc_opencl_config.device.wgsize[1] && 0 == (sgsize % c_dbcsr_acc_opencl_config.device.wgsize[1]) &&
+                  c_dbcsr_acc_opencl_config.device.wgsize[2] < sgsize)
+              {
+                if (c_dbcsr_acc_opencl_config.device.wgsize[1] < sgsize) c_dbcsr_acc_opencl_config.device.wgsize[1] = sgsize;
+                c_dbcsr_acc_opencl_config.device.wgsize[2] = sgsize;
+              }
             }
+            if (0 != c_dbcsr_acc_opencl_config.device.wgsize[2]) c_dbcsr_acc_opencl_config.device.wgsize[2] = sgmin;
+          }
+          else {
+            c_dbcsr_acc_opencl_config.device.wgsize[2] = 0;
+          }
+#  if defined(ACC_OPENCL_XHINTS) && defined(ACC_OPENCL_MEM_DEVPTR)
+          if (0 != (1 & c_dbcsr_acc_opencl_config.xhints) && 2 <= *c_dbcsr_acc_opencl_config.device.std_level &&
+              0 != c_dbcsr_acc_opencl_config.device.intel && 0 == c_dbcsr_acc_opencl_config.device.unified &&
+              EXIT_SUCCESS == clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL) &&
+              EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "intel", 2 /*platform vendor*/) &&
+              EXIT_SUCCESS == clGetDeviceInfo(active_id, 0x4191 /*CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL*/, sizeof(cl_bitfield),
+                                &bitfield, NULL) &&
+              0 != bitfield) /* cl_intel_unified_shared_memory extension */
+          {
+            void* ptr = NULL;
+            ptr = clGetExtensionFunctionAddressForPlatform(platform, "clSetKernelArgMemPointerINTEL");
+            LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clSetKernelArgMemPointerINTEL, &ptr);
+            ptr = clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemFillINTEL");
+            LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clEnqueueMemFillINTEL, &ptr);
+            ptr = clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemcpyINTEL");
+            LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL, &ptr);
+            ptr = clGetExtensionFunctionAddressForPlatform(platform, "clDeviceMemAllocINTEL");
+            LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clDeviceMemAllocINTEL, &ptr);
+            ptr = clGetExtensionFunctionAddressForPlatform(platform, "clMemFreeINTEL");
+            LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clMemFreeINTEL, &ptr);
           }
-          if (0 != c_dbcsr_acc_opencl_config.device.wgsize[2]) c_dbcsr_acc_opencl_config.device.wgsize[2] = sgmin;
-        }
-        else {
-          c_dbcsr_acc_opencl_config.device.wgsize[2] = 0;
-        }
-#  if defined(ACC_OPENCL_MEM_DEVPTR)
-        if (0 != (1 & c_dbcsr_acc_opencl_config.xhints) && 2 <= *c_dbcsr_acc_opencl_config.device.std_level &&
-            0 != c_dbcsr_acc_opencl_config.device.intel && 0 == c_dbcsr_acc_opencl_config.device.unified &&
-            EXIT_SUCCESS == clGetDeviceInfo(active_id, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL) &&
-            EXIT_SUCCESS == c_dbcsr_acc_opencl_device_vendor(active_id, "intel", 2 /*platform vendor*/) &&
-            EXIT_SUCCESS == clGetDeviceInfo(active_id, 0x4191 /*CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL*/, sizeof(cl_bitfield),
-                              &bitfield, NULL) &&
-            0 != bitfield) /* cl_intel_unified_shared_memory extension */
-        {
-          void* ptr = NULL;
-          ptr = clGetExtensionFunctionAddressForPlatform(platform, "clSetKernelArgMemPointerINTEL");
-          LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clSetKernelArgMemPointerINTEL, &ptr);
-          ptr = clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemFillINTEL");
-          LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clEnqueueMemFillINTEL, &ptr);
-          ptr = clGetExtensionFunctionAddressForPlatform(platform, "clEnqueueMemcpyINTEL");
-          LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL, &ptr);
-          ptr = clGetExtensionFunctionAddressForPlatform(platform, "clDeviceMemAllocINTEL");
-          LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clDeviceMemAllocINTEL, &ptr);
-          ptr = clGetExtensionFunctionAddressForPlatform(platform, "clMemFreeINTEL");
-          LIBXSMM_ASSIGN127(&c_dbcsr_acc_opencl_config.device.clMemFreeINTEL, &ptr);
-        }
 #  endif
 #  if defined(ACC_OPENCL_CMDAGR)
-        if (0 != c_dbcsr_acc_opencl_config.device.intel) { /* device vendor (above) can now be used */
-          int result_cmdagr = EXIT_SUCCESS;
-          const cl_command_queue q = ACC_OPENCL_CREATE_COMMAND_QUEUE(context, active_id, properties, &result_cmdagr);
-          if (EXIT_SUCCESS == result_cmdagr) {
+          if (0 != c_dbcsr_acc_opencl_config.device.intel) { /* device vendor (above) can now be used */
+            int result_cmdagr = EXIT_SUCCESS;
+            const cl_command_queue q = ACC_OPENCL_CREATE_COMMAND_QUEUE(context, active_id, properties, &result_cmdagr);
+            if (EXIT_SUCCESS == result_cmdagr) {
 #    if 0 /* force host-timer? */
-            c_dbcsr_acc_opencl_config.timer = c_dbcsr_acc_opencl_timer_host;
+              c_dbcsr_acc_opencl_config.timer = c_dbcsr_acc_opencl_timer_host;
 #    endif
-            assert(NULL != q);
-            clReleaseCommandQueue(q);
+              assert(NULL != q);
+              clReleaseCommandQueue(q);
+            }
           }
-        }
 #  endif
-        properties[1] = 0;
-        c_dbcsr_acc_opencl_config.device.stream.queue = ACC_OPENCL_CREATE_COMMAND_QUEUE(context, active_id, properties, &result);
-      }
-      if (EXIT_SUCCESS == result) {
-        if (active_id != context_id) {
-          assert(active_id != c_dbcsr_acc_opencl_config.device.id);
-          c_dbcsr_acc_opencl_config.device.context = context;
-          c_dbcsr_acc_opencl_config.device.id = active_id;
+          properties[1] = 0;
+          c_dbcsr_acc_opencl_config.device.stream.queue = ACC_OPENCL_CREATE_COMMAND_QUEUE(context, active_id, properties, &result);
+        }
+        if (EXIT_SUCCESS == result) {
+          if (active_id != context_id) {
+            assert(active_id != c_dbcsr_acc_opencl_config.device.id);
+            c_dbcsr_acc_opencl_config.device.context = context;
+            c_dbcsr_acc_opencl_config.device.id = active_id;
+          }
+          assert(active_id == c_dbcsr_acc_opencl_config.device.id);
         }
-        assert(active_id == c_dbcsr_acc_opencl_config.device.id);
+        else memset(&c_dbcsr_acc_opencl_config.device, 0, sizeof(c_dbcsr_acc_opencl_config.device));
       }
-      else memset(&c_dbcsr_acc_opencl_config.device, 0, sizeof(c_dbcsr_acc_opencl_config.device));
+      if (NULL != lock) ACC_OPENCL_RELEASE(lock);
     }
-    if (NULL != lock) ACC_OPENCL_RELEASE(lock);
+    else result = EXIT_FAILURE;
   }
   else result = EXIT_FAILURE;
   assert(EXIT_SUCCESS == result || NULL == c_dbcsr_acc_opencl_config.device.context);
@@ -1156,13 +1178,8 @@ int c_dbcsr_acc_opencl_set_active_device(ACC_OPENCL_LOCKTYPE* lock, int device_i
 
 
 int c_dbcsr_acc_set_active_device(int device_id) {
+  /* avoid ACC_OPENCL_PROFILE in this routine */
   int result = EXIT_SUCCESS;
-#  if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE)
-  int routine_handle;
-  static const char* const routine_name_ptr = LIBXSMM_FUNCNAME;
-  static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1;
-  c_dbcsr_timeset((const char**)&routine_name_ptr, &routine_name_len, &routine_handle);
-#  endif
   if (0 <= device_id && device_id < c_dbcsr_acc_opencl_config.ndevices) {
 #  if defined(ACC_OPENCL_CACHE_DID)
     if (c_dbcsr_acc_opencl_active_id != (device_id + 1))
@@ -1174,12 +1191,7 @@ int c_dbcsr_acc_set_active_device(int device_id) {
 #  endif
     }
   }
-#  if !defined(NDEBUG)
   else result = EXIT_FAILURE;
-#  endif
-#  if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE)
-  c_dbcsr_timestop(&routine_handle);
-#  endif
   ACC_OPENCL_RETURN(result);
 }
 
diff --git a/src/acc/opencl/acc_opencl.h b/src/acc/opencl/acc_opencl.h
index cd6639983fd..ddd1834051f 100644
--- a/src/acc/opencl/acc_opencl.h
+++ b/src/acc/opencl/acc_opencl.h
@@ -104,6 +104,9 @@
 #if !defined(ACC_OPENCL_ASYNC) && 1
 #  define ACC_OPENCL_ASYNC getenv("ACC_OPENCL_ASYNC")
 #endif
+#if !defined(ACC_OPENCL_XHINTS) && 1
+#  define ACC_OPENCL_XHINTS getenv("ACC_OPENCL_XHINTS")
+#endif
 #if !defined(ACC_OPENCL_STREAM_PRIORITIES) && 0
 #  if defined(CL_QUEUE_PRIORITY_KHR)
 #    define ACC_OPENCL_STREAM_PRIORITIES
@@ -121,7 +124,7 @@
 #  define ACC_OPENCL_ACTIVATE 0
 #endif
 /* Use DBCSR's profile for detailed timings */
-#if !defined(ACC_OPENCL_PROFILE) && 0
+#if !defined(ACC_OPENCL_PROFILE) && (defined(__OFFLOAD_PROFILING) || 0)
 #  define ACC_OPENCL_PROFILE
 #endif
 
@@ -359,7 +362,7 @@ typedef struct c_dbcsr_acc_opencl_config_t {
 extern c_dbcsr_acc_opencl_config_t c_dbcsr_acc_opencl_config;
 
 /** Determines host-pointer registration for modification. */
-c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory);
+c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_hostptr(const void* memory);
 /** Determines device-pointer registration for modification (internal). */
 c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_devptr_modify(
   ACC_OPENCL_LOCKTYPE* lock, void* memory, size_t elsize, const size_t* amount, size_t* offset);
diff --git a/src/acc/opencl/acc_opencl_mem.c b/src/acc/opencl/acc_opencl_mem.c
index 41fc76519c0..7d9bd86a4d0 100644
--- a/src/acc/opencl/acc_opencl_mem.c
+++ b/src/acc/opencl/acc_opencl_mem.c
@@ -60,7 +60,7 @@ void c_dbcsr_acc_opencl_pfree(ACC_OPENCL_LOCKTYPE* lock, const void* pointer, vo
 }
 
 
-c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_hostptr(void* memory) {
+c_dbcsr_acc_opencl_info_memptr_t* c_dbcsr_acc_opencl_info_hostptr(const void* memory) {
   assert(NULL == memory || sizeof(c_dbcsr_acc_opencl_info_memptr_t) <= (uintptr_t)memory);
   return (NULL != memory ? (c_dbcsr_acc_opencl_info_memptr_t*)((uintptr_t)memory - sizeof(c_dbcsr_acc_opencl_info_memptr_t))
                          : (c_dbcsr_acc_opencl_info_memptr_t*)NULL);
@@ -168,6 +168,8 @@ int c_dbcsr_acc_opencl_info_devptr(
 int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream) {
   const size_t size_meminfo = sizeof(c_dbcsr_acc_opencl_info_memptr_t);
   int result = EXIT_SUCCESS, alignment = sizeof(void*);
+  cl_mem_flags flags = CL_MEM_ALLOC_HOST_PTR;
+  void* host_ptr = NULL;
   cl_mem memory = NULL;
 #  if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE)
   int routine_handle;
@@ -186,12 +188,25 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream)
       EXIT_SUCCESS == c_dbcsr_acc_opencl_set_active_device(NULL /*lock*/, (int)c_dbcsr_acc_opencl_config.device.uid));
   }
 #  endif
-  memory = clCreateBuffer(c_dbcsr_acc_opencl_config.device.context, CL_MEM_ALLOC_HOST_PTR, nbytes, NULL /*host_ptr*/, &result);
+#  if defined(ACC_OPENCL_XHINTS)
+  if (0 != (8 & c_dbcsr_acc_opencl_config.xhints) && (0 != c_dbcsr_acc_opencl_config.device.nv || NULL != (ACC_OPENCL_XHINTS))) {
+    host_ptr = malloc(nbytes);
+    if (NULL != host_ptr) flags = CL_MEM_USE_HOST_PTR;
+  }
+#  endif
+  memory = clCreateBuffer(c_dbcsr_acc_opencl_config.device.context, flags, nbytes, host_ptr, &result);
   if (EXIT_SUCCESS == result) {
-    const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? ACC_OPENCL_STREAM(stream)
-                                                                   : c_dbcsr_acc_opencl_stream_default());
-    void* const mapped = clEnqueueMapBuffer(
-      str->queue, memory, CL_TRUE /*always block*/, CL_MAP_READ | CL_MAP_WRITE, 0 /*offset*/, nbytes, 0, NULL, NULL, &result);
+    void* mapped = host_ptr;
+    if (NULL == host_ptr) {
+      const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? ACC_OPENCL_STREAM(stream)
+                                                                     : c_dbcsr_acc_opencl_stream_default());
+      mapped = clEnqueueMapBuffer(str->queue, memory, CL_TRUE /*always block*/,
+#  if defined(ACC_OPENCL_XHINTS) && (defined(CL_VERSION_1_2) || defined(CL_MAP_WRITE_INVALIDATE_REGION))
+        (4 & c_dbcsr_acc_opencl_config.xhints) ? CL_MAP_WRITE_INVALIDATE_REGION :
+#  endif
+                                               (CL_MAP_READ | CL_MAP_WRITE),
+        0 /*offset*/, nbytes, 0, NULL, NULL, &result);
+    }
     assert(EXIT_SUCCESS == result || NULL == mapped);
     if (EXIT_SUCCESS == result) {
       const uintptr_t address = (uintptr_t)mapped;
@@ -210,6 +225,7 @@ int c_dbcsr_acc_host_mem_allocate(void** host_mem, size_t nbytes, void* stream)
   if (EXIT_SUCCESS != result) {
     if (NULL != memory) ACC_OPENCL_EXPECT(EXIT_SUCCESS == clReleaseMemObject(memory));
     *host_mem = NULL;
+    free(host_ptr);
   }
   assert(EXIT_SUCCESS == result || NULL == *host_mem);
 #  if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE)
@@ -231,13 +247,25 @@ int c_dbcsr_acc_host_mem_deallocate(void* host_mem, void* stream) {
     c_dbcsr_acc_opencl_info_memptr_t* const meminfo = c_dbcsr_acc_opencl_info_hostptr(host_mem);
     if (NULL != meminfo->memory) {
       const c_dbcsr_acc_opencl_info_memptr_t info = *meminfo; /* copy meminfo prior to unmap */
-      const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? ACC_OPENCL_STREAM(stream)
-                                                                     : c_dbcsr_acc_opencl_stream_default());
+      void* host_ptr = NULL;
       int result_release;
-      cl_event event;
-      assert(NULL != str && NULL != str->queue);
-      result = clEnqueueUnmapMemObject(str->queue, info.memory, info.memptr, 0, NULL, &event);
-      if (NULL == stream && EXIT_SUCCESS == result) result = clWaitForEvents(1, &event);
+#  if defined(ACC_OPENCL_XHINTS)
+      if ((8 & c_dbcsr_acc_opencl_config.xhints) && (0 != c_dbcsr_acc_opencl_config.device.nv || NULL != (ACC_OPENCL_XHINTS)) &&
+          EXIT_SUCCESS != clGetMemObjectInfo(info.memory, CL_MEM_HOST_PTR, sizeof(void*), &host_ptr, NULL))
+      {
+        host_ptr = NULL; /* query failed: treat buffer as runtime-allocated (mapped) memory */
+      }
+      if (NULL == host_ptr) /* no user-provided backing store (CL_MEM_USE_HOST_PTR): unmap is required */
+#  endif
+      {
+        const c_dbcsr_acc_opencl_stream_t* const str = (NULL != stream ? ACC_OPENCL_STREAM(stream)
+                                                                       : c_dbcsr_acc_opencl_stream_default());
+        cl_event event;
+        assert(NULL != str && NULL != str->queue);
+        result = clEnqueueUnmapMemObject(str->queue, info.memory, info.memptr, 0, NULL, &event);
+        if (NULL == stream && EXIT_SUCCESS == result) result = clWaitForEvents(1, &event);
+      }
       result_release = clReleaseMemObject(info.memory);
+      free(host_ptr); /* backing store must outlive the buffer object: free after release (no-op if NULL) */
       if (EXIT_SUCCESS == result) result = result_release;
     }
@@ -256,14 +284,14 @@ int c_dbcsr_acc_opencl_memcpy_d2h(
 int c_dbcsr_acc_opencl_memcpy_d2h(
   cl_mem dev_mem, void* host_mem, size_t offset, size_t nbytes, cl_command_queue queue, int blocking) {
 #  if defined(ACC_OPENCL_ASYNC)
-  const cl_bool finish = (0 != blocking || 0 == (2 & c_dbcsr_acc_opencl_config.async) ||
-                          (0 != c_dbcsr_acc_opencl_config.device.nv && NULL == (ACC_OPENCL_ASYNC)));
+  const cl_bool finish = (0 != blocking || 0 == (2 & c_dbcsr_acc_opencl_config.async));
 #  else
   const cl_bool finish = CL_TRUE;
 #  endif
   int result = EXIT_SUCCESS;
 #  if defined(ACC_OPENCL_MEM_DEVPTR)
   if (NULL != c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL) {
+    assert(0 == c_dbcsr_acc_opencl_config.device.unified);
     result = c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL(queue, finish, host_mem, dev_mem, nbytes, 0, NULL, NULL);
   }
   else
@@ -275,6 +303,7 @@ int c_dbcsr_acc_opencl_memcpy_d2h(
     int result_sync = EXIT_SUCCESS;
 #  if defined(ACC_OPENCL_MEM_DEVPTR)
     if (NULL != c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL) {
+      assert(0 == c_dbcsr_acc_opencl_config.device.unified);
       result_sync = c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL(queue, CL_TRUE, host_mem, dev_mem, nbytes, 0, NULL, NULL);
     }
     else
@@ -316,6 +345,7 @@ int c_dbcsr_acc_dev_mem_allocate(void** dev_mem, size_t nbytes) {
   assert(NULL != dev_mem && NULL != context);
 #  if defined(ACC_OPENCL_MEM_DEVPTR)
   if (NULL != c_dbcsr_acc_opencl_config.device.clDeviceMemAllocINTEL) {
+    assert(0 == c_dbcsr_acc_opencl_config.device.unified);
     *dev_mem = memptr = c_dbcsr_acc_opencl_config.device.clDeviceMemAllocINTEL(
       context, c_dbcsr_acc_opencl_config.device.id, NULL /*properties*/, nbytes, 0 /*alignment*/, &result);
     if (EXIT_SUCCESS != result) *dev_mem = NULL;
@@ -409,6 +439,7 @@ int c_dbcsr_acc_dev_mem_deallocate(void* dev_mem) {
 #  else
     assert(NULL != c_dbcsr_acc_opencl_config.device.context);
     if (NULL != c_dbcsr_acc_opencl_config.device.clMemFreeINTEL) {
+      assert(0 == c_dbcsr_acc_opencl_config.device.unified);
       result = c_dbcsr_acc_opencl_config.device.clMemFreeINTEL(c_dbcsr_acc_opencl_config.device.context, dev_mem);
     }
     else {
@@ -479,14 +510,14 @@ int c_dbcsr_acc_memcpy_h2d(const void* host_mem, void* dev_mem, size_t nbytes, v
     const c_dbcsr_acc_opencl_stream_t* const str =
       (NULL != stream ? ACC_OPENCL_STREAM(stream) : c_dbcsr_acc_opencl_stream(NULL /*lock*/, ACC_OPENCL_OMP_TID()));
 #  if defined(ACC_OPENCL_ASYNC)
-    const cl_bool finish = (0 == (1 & c_dbcsr_acc_opencl_config.async) || NULL == stream ||
-                            (0 != c_dbcsr_acc_opencl_config.device.nv && NULL == (ACC_OPENCL_ASYNC)));
+    const cl_bool finish = (0 == (1 & c_dbcsr_acc_opencl_config.async) || NULL == stream);
 #  else
         const cl_bool finish = CL_TRUE;
 #  endif
     assert(NULL != str && NULL != str->queue);
 #  if defined(ACC_OPENCL_MEM_DEVPTR)
     if (NULL != c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL) {
+      assert(0 == c_dbcsr_acc_opencl_config.device.unified);
       result = c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL(str->queue, finish, dev_mem, host_mem, nbytes, 0, NULL, NULL);
     }
     else
@@ -566,6 +597,7 @@ int c_dbcsr_acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t nbyt
 #  if defined(ACC_OPENCL_MEM_DEVPTR)
     assert(NULL != c_dbcsr_acc_opencl_config.device.context);
     if (NULL != c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL) {
+      assert(0 == c_dbcsr_acc_opencl_config.device.unified);
       result = c_dbcsr_acc_opencl_config.device.clEnqueueMemcpyINTEL(
         str->queue, CL_FALSE /*blocking*/, devmem_dst, devmem_src, nbytes, 0, NULL, &event);
     }
@@ -616,6 +648,7 @@ int c_dbcsr_acc_opencl_memset(void* dev_mem, int value, size_t offset, size_t nb
 #  if defined(ACC_OPENCL_MEM_DEVPTR)
     assert(NULL != c_dbcsr_acc_opencl_config.device.context);
     if (NULL != c_dbcsr_acc_opencl_config.device.clEnqueueMemFillINTEL) {
+      assert(0 == c_dbcsr_acc_opencl_config.device.unified);
       result = c_dbcsr_acc_opencl_config.device.clEnqueueMemFillINTEL(
         str->queue, (char*)dev_mem + offset, &value, size_of_value, nbytes, 0, NULL, &event);
     }
diff --git a/src/acc/opencl/acc_opencl_stream.c b/src/acc/opencl/acc_opencl_stream.c
index 41297015ba8..29ade32dba8 100644
--- a/src/acc/opencl/acc_opencl_stream.c
+++ b/src/acc/opencl/acc_opencl_stream.c
@@ -117,6 +117,7 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) {
   if (NULL != c_dbcsr_acc_opencl_config.device.context)
 #  endif
   {
+#  if defined(ACC_OPENCL_XHINTS)
     if ((2 & c_dbcsr_acc_opencl_config.xhints) && 0 != c_dbcsr_acc_opencl_config.device.intel) { /* enable queue families */
       struct {
         cl_command_queue_properties properties;
@@ -141,6 +142,7 @@ int c_dbcsr_acc_stream_create(void** stream_p, const char* name, int priority) {
         }
       }
     }
+#  endif
     if ((c_dbcsr_acc_opencl_timer_device == c_dbcsr_acc_opencl_config.timer) &&
         (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity))
     {
diff --git a/src/acc/opencl/smm/opencl_libsmm.c b/src/acc/opencl/smm/opencl_libsmm.c
index 97a0e84a891..409659980cb 100644
--- a/src/acc/opencl/smm/opencl_libsmm.c
+++ b/src/acc/opencl/smm/opencl_libsmm.c
@@ -28,15 +28,20 @@
       libxsmm_gemm_descriptor_dinit(BLOB, PREC, M, N, K, LDA, LDB, LDC, 1.0, 1.0, FLAGS, PREFETCH)
 #  endif
 
-#  if !defined(OPENCL_LIBSMM_VALIDATE_TRANS) && defined(OPENCL_LIBSMM_VALIDATE) && \
-    (1 < OPENCL_LIBSMM_VALIDATE || 0 > OPENCL_LIBSMM_VALIDATE)
-#    define OPENCL_LIBSMM_VALIDATE_TRANS
-#  endif
-#  if !defined(OPENCL_LIBSMM_VALIDATE_SMM) && defined(OPENCL_LIBSMM_VALIDATE)
-#    define OPENCL_LIBSMM_VALIDATE_SMM
-#  endif
-#  if !defined(OPENCL_LIBSMM_VALIDATE_EXIT) && defined(OPENCL_LIBSMM_VALIDATE) && 1
-#    define OPENCL_LIBSMM_VALIDATE_EXIT
+#  if defined(OPENCL_LIBSMM_VALIDATE)
+#    if !defined(OPENCL_LIBSMM_VALIDATE_TRANS) && (1 < OPENCL_LIBSMM_VALIDATE || 0 > OPENCL_LIBSMM_VALIDATE)
+#      define OPENCL_LIBSMM_VALIDATE_TRANS
+#    endif
+#    if !defined(OPENCL_LIBSMM_VALIDATE_SMM)
+#      define OPENCL_LIBSMM_VALIDATE_SMM
+#    endif
+#    if !defined(OPENCL_LIBSMM_VALIDATE_EXIT) && 1
+#      define OPENCL_LIBSMM_VALIDATE_EXIT
+#    endif
+#    if !defined(OPENCL_LIBSMM_VALIDATE_SCRATCH)
+#      define OPENCL_LIBSMM_VALIDATE_SCRATCH(SIZE, ALIGN) /*libxsmm_aligned_scratch(SIZE, ALIGN)*/ malloc(SIZE)
+#      define OPENCL_LIBSMM_VALIDATE_FREE(PTR) /*libxsmm_free(PTR)*/ free(PTR)
+#    endif
 #  endif
 #  if !defined(OPENCL_LIBSMM_KERNELNAME_TRANS)
 #    define OPENCL_LIBSMM_KERNELNAME_TRANS "trans"
@@ -111,31 +116,6 @@ int opencl_libsmm_use_cmem(cl_device_id device) {
 }
 
 
-#  if defined(OPENCL_LIBSMM_VALIDATE) && (0 != OPENCL_LIBSMM_VALIDATE)
-void opencl_libsmm_print_matrix(FILE* ostream, const char* label, libsmm_acc_data_t type, const void* mat, int m, int n) {
-  int i, j;
-  const char* const s = (NULL != label ? label : "");
-  const int len = (int)strlen(s);
-  for (i = 0; i < m; ++i) {
-    if (0 < i) {
-      fprintf(ostream, "%*s", len, " ");
-    }
-    else {
-      fprintf(ostream, "%s", s);
-    }
-    for (j = 0; j < n; ++j) {
-      switch (type) {
-        case dbcsr_type_real_8: fprintf(ostream, "%.2f ", ((const double*)mat)[i * n + j]); break;
-        case dbcsr_type_real_4: fprintf(ostream, "%.2f ", ((const float*)mat)[i * n + j]); break;
-        default: fprintf(ostream, "? ");
-      }
-    }
-    fprintf(ostream, "\n");
-  }
-}
-#  endif
-
-
 int opencl_libsmm_write_trans_params(FILE* stream, int only_key, const opencl_libsmm_transkey_t* key,
   const opencl_libsmm_trans_t* config, const char* delim, const char* begin, const char* close) {
   int result = 0;
@@ -209,7 +189,7 @@ int opencl_libsmm_read_smm_params(char* parambuf, opencl_libsmm_smmkey_t* key, o
   LIBXSMM_MEMZERO127(key); /* potentially heterogeneous key-data (alignment gaps) */
   memset(value, 0, sizeof(opencl_libsmm_smm_t));
   for (; NULL != s;
-       ++i, s = (c != consumed ? ((s + 1) < end ? strtok((s + 1) + strlen(s), ACC_OPENCL_DELIMS) : NULL) : s), c = consumed)
+    ++i, s = (c != consumed ? ((s + 1) < end ? strtok((s + 1) + strlen(s), ACC_OPENCL_DELIMS) : NULL) : s), c = consumed)
   {
     switch (i) {
       case 0:
@@ -521,7 +501,9 @@ int libsmm_acc_init(void) {
                     memcpy(config_init, &config, sizeof(config));
                   }
 #    if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER
-                  if (active_match == i && c_dbcsr_acc_opencl_config.device.uid != key.devuid) {
+                  if (active_match == i && 0 != c_dbcsr_acc_opencl_config.device.uid &&
+                      c_dbcsr_acc_opencl_config.device.uid != key.devuid)
+                  {
                     key.devuid = c_dbcsr_acc_opencl_config.device.uid;
                     config_init = (opencl_libsmm_smm_t*)libxsmm_xdispatch(&key, sizeof(key));
                     if (NULL == config_init && NULL != libxsmm_xregister(&key, sizeof(key), sizeof(config), &config)) {
@@ -786,7 +768,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
         const size_t scratch_size = (sizeof(int) * offset_stack_size) /*stack*/
                                     + data_size /*imat*/ + data_size /*omat*/ + (mn * typesize) /*gold*/
                                     + 3 * (LIBXSMM_ALIGNMENT - 1) /*alignments*/;
-        scratch = libxsmm_aligned_scratch(scratch_size, LIBXSMM_ALIGNMENT);
+        scratch = OPENCL_LIBSMM_VALIDATE_SCRATCH(scratch_size, LIBXSMM_ALIGNMENT);
         if (NULL != scratch) {
           stack = (int*)scratch;
           imat = (char*)LIBXSMM_UP2((uintptr_t)stack + sizeof(int) * offset_stack_size, LIBXSMM_ALIGNMENT);
@@ -855,20 +837,15 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
       }
 #  if defined(OPENCL_LIBSMM_VALIDATE_TRANS)
       ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_data, omat, data_size, stream), "transfer validation test", result);
-#  endif
-#  if defined(OPENCL_LIBSMM_VALIDATE_TRANS)
       ACC_OPENCL_CHECK(c_dbcsr_acc_stream_sync(stream), "sync stream", result);
-#  endif
-#  if defined(OPENCL_LIBSMM_VALIDATE_TRANS)
       if (EXIT_SUCCESS == result) {
-        int i, j;
-        LIBXSMM_STDIO_ACQUIRE();
+        char print_buffer[2048] = "";
+        int print_offset = 0, i, j;
         if (0 != c_dbcsr_acc_opencl_config.verbosity) {
-          fprintf(stderr,
-            "libsmm_acc_transpose("
-            "offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)",
-            offset, stack_size, dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m, n,
-            max_kernel_dim, stream);
+          print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset,
+            "libsmm_acc_transpose(offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)", offset, stack_size,
+            dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m, n, max_kernel_dim,
+            stream);
         }
         for (i = offset; i < offset_stack_size; ++i) {
           const size_t index = stack[i];
@@ -879,20 +856,13 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
           libxsmm_itrans(gold, typesize, m, n, m, n);
           if (0 != memcmp(gold, test, mn * typesize)) {
             if (0 == c_dbcsr_acc_opencl_config.verbosity) {
-              fprintf(stderr,
-                "libsmm_acc_transpose("
-                "offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)",
-                offset, stack_size, dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m,
-                n, max_kernel_dim, stream);
-            }
-            fprintf(stderr, " => ERROR\n");
-            if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
-              fprintf(stderr, "stackposition = %i (index=%llu)\n", i, (unsigned long long)index);
-              opencl_libsmm_print_matrix(stderr, "orig = ", datatype, orig, m, n);
-              opencl_libsmm_print_matrix(stderr, "gold = ", datatype, gold, n, m);
-              opencl_libsmm_print_matrix(stderr, "test = ", datatype, test, n, m);
-              fprintf(stderr, "\n");
+              print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset,
+                "libsmm_acc_transpose(offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)", offset, stack_size,
+                dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m, n, max_kernel_dim,
+                stream);
             }
+            print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => ERROR\n");
 #    if defined(OPENCL_LIBSMM_VALIDATE_EXIT)
+            fputs(print_buffer, stderr); /* exit() skips the deferred output below: emit the report now */
             exit(EXIT_FAILURE);
 #    else
@@ -903,7 +873,8 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
           for (j = offset; j < i; ++j) {
             const size_t duplicate = stack[j];
             if (index == duplicate) {
-              fprintf(stderr, " => ERROR\n");
+              print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => ERROR\n");
 #    if defined(OPENCL_LIBSMM_VALIDATE_EXIT)
+              fputs(print_buffer, stderr); /* exit() skips the deferred output below: emit the report now */
               exit(EXIT_FAILURE);
 #    else
@@ -915,8 +886,10 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
           }
         }
         if (0 != c_dbcsr_acc_opencl_config.verbosity && EXIT_SUCCESS == result) {
-          fprintf(stderr, " => OK\n");
+          print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => OK\n");
         }
+        LIBXSMM_STDIO_ACQUIRE();
+        fputs(print_buffer, stderr);
         LIBXSMM_STDIO_RELEASE();
       }
-      libxsmm_free(scratch);
+      OPENCL_LIBSMM_VALIDATE_FREE(scratch); /* must pair with OPENCL_LIBSMM_VALIDATE_SCRATCH */
@@ -1113,7 +1086,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
               (NULL == env_nz || '\0' == *env_nz) ? (0 != defaults ? /*default*/ 0 : config->nz) : atoi(env_nz), 0, 1);
             new_config.al = LIBXSMM_CLMP(/* bug: AL=1 */
               (NULL == env_al || '\0' == *env_al)
-                ? (0 == (32 & c_dbcsr_acc_opencl_config.wa) ? (0 != defaults ? 0 : config->al) : 0)
+                ? (0 == (64 & c_dbcsr_acc_opencl_config.wa) ? (0 != defaults ? 0 : config->al) : 0)
                 : atoi(env_al),
               0, 1);
             new_config.tb = LIBXSMM_CLMP(
@@ -1124,7 +1097,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
               (NULL == env_ap || '\0' == *env_ap) ? (0 != defaults ? /*default*/ 0 : config->ap) : atoi(env_ap), 0, 1);
             new_config.aa = LIBXSMM_CLMP(/* bug: AA=2 XF=1 */
               (NULL == env_aa || '\0' == *env_aa) ? (0 != defaults ? default_aa : config->aa) : atoi(env_aa), 0,
-              (0 == (64 & c_dbcsr_acc_opencl_config.wa) || 0 == new_config.flags) ? 2 : 1);
+              (0 == (32 & c_dbcsr_acc_opencl_config.wa) || 0 == new_config.flags) ? 2 : 1);
             new_config.ab = LIBXSMM_CLMP(
               (NULL == env_ab || '\0' == *env_ab) ? (0 != defaults ? default_ab : config->ab) : atoi(env_ab), 0, 2);
             new_config.ac = LIBXSMM_CLMP(
@@ -1342,7 +1315,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
             &blob, precision, m_max, n_max, k_max, m_max, k_max, m_max, LIBXSMM_GEMM_FLAG_NONE, LIBXSMM_PREFETCH_NONE);
           const size_t scratch_size = psize + asize + bsize + csize + csize + k_max * n_max * typesize +
                                       5 * (LIBXSMM_ALIGNMENT - 1) /*alignments*/;
-          scratch = libxsmm_aligned_scratch(scratch_size, LIBXSMM_ALIGNMENT);
+          scratch = OPENCL_LIBSMM_VALIDATE_SCRATCH(scratch_size, LIBXSMM_ALIGNMENT);
           if (NULL != desc && NULL != scratch) {
             pinp = (int*)scratch;
             ainp = (char*)LIBXSMM_UP2((uintptr_t)pinp + psize, LIBXSMM_ALIGNMENT);
@@ -1429,10 +1402,12 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
           const char* const env_tol = getenv("OPENCL_LIBSMM_SMM_TOLERANCE");
           const double tolerance = ((NULL == env_tol || '\0' == *env_tol) ? 1E-3 : atof(env_tol));
           const int* const params = pinp + (4 <= nparams ? (nparams - 4) : 0);
+          char print_buffer[2048] = "";
+          int print_offset = 0;
           size_t i;
-          LIBXSMM_STDIO_ACQUIRE();
           if (0 != c_dbcsr_acc_opencl_config.verbosity) {
-            fprintf(stderr, "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
+            print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset,
+              "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
               dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m_max, n_max, k_max,
               max_kernel_dim, stream);
           }
@@ -1458,20 +1433,22 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
 #    endif
             if (tolerance < epsilon) {
               if (0 == c_dbcsr_acc_opencl_config.verbosity) {
-                fprintf(stderr, "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
+                print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset,
+                  "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
                   dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m_max, n_max, k_max,
                   max_kernel_dim, stream);
               }
 #    if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER
-              fprintf(stderr, " => ERROR diff=%g (%g != %g)\n", diff.linf_abs, diff.v_ref, diff.v_tst);
-#    else
-              fprintf(stderr, " => ERROR diff=%g\n", diff.linf_abs);
+              if (LIBXSMM_NOTNAN(diff.v_tst)) {
+                print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset,
+                  " => ERROR diff=%g (|%g-%g|=%g)\n", epsilon, diff.v_ref, diff.v_tst, diff.linf_abs);
+              }
+              else
 #    endif
-              if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
-                fprintf(stderr, "stackposition = %llu (index=%llu)\n", (unsigned long long)i, (unsigned long long)ic);
-                opencl_libsmm_print_matrix(stderr, "gold = ", datatype, gold + ic, m_max, n_max);
-                opencl_libsmm_print_matrix(stderr, "test = ", datatype, test + ic, m_max, n_max);
-                fprintf(stderr, "\n");
+              {
+                print_offset += LIBXSMM_SNPRINTF(
+                  print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => ERROR diff=%g\n", epsilon);
               }
 #    if defined(OPENCL_LIBSMM_VALIDATE_EXIT)
+              fputs(print_buffer, stderr); /* exit() skips the deferred output below: emit the report now */
               exit(EXIT_FAILURE);
@@ -1482,8 +1459,10 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
             }
           }
           if (0 != c_dbcsr_acc_opencl_config.verbosity && EXIT_SUCCESS == result) {
-            fprintf(stderr, " => OK\n");
+            print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => OK\n");
           }
+          LIBXSMM_STDIO_ACQUIRE();
+          fputs(print_buffer, stderr);
           LIBXSMM_STDIO_RELEASE();
         }
-        libxsmm_free(scratch);
+        OPENCL_LIBSMM_VALIDATE_FREE(scratch); /* must pair with OPENCL_LIBSMM_VALIDATE_SCRATCH */
diff --git a/src/acc/opencl/smm/params/tune_multiply_GH200.csv b/src/acc/opencl/smm/params/tune_multiply_GH200.csv
new file mode 100644
index 00000000000..7275e0e771f
--- /dev/null
+++ b/src/acc/opencl/smm/params/tune_multiply_GH200.csv
@@ -0,0 +1,317 @@
+DEVICE;TYPEID;M;N;K;S;GFLOPS;BS;BM;BN;BK;WS;WG;LU;NZ;AL;TB;TC;AP;AA;AB;AC
+NVIDIA GH200 480GB [0x3528];3;2;2;2;30000;0;11;2;1;2;2;1;-2;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;3;3;3;30000;0;12;3;1;2;3;1;0;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;4;4;30000;0;14;4;1;2;4;-1;1;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;4;5;30000;0;13;4;1;2;4;1;-2;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;4;7;30000;0;12;4;1;3;4;-2;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;4;9;30000;0;12;4;1;4;4;0;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;4;10;30000;0;5;4;1;4;4;0;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;4;13;30000;0;4;4;1;3;4;1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;4;15;30000;0;4;4;1;3;4;0;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;4;17;30000;0;4;4;1;4;4;0;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;4;25;30000;0;19;4;1;3;4;-2;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;4;26;30000;0;3;4;1;2;4;0;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;4;28;30000;0;3;4;1;2;4;-1;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;4;32;30000;0;3;4;1;2;4;-2;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;5;4;30000;0;15;4;1;2;5;0;0;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;5;5;30000;0;13;4;1;2;5;0;-2;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;5;7;30000;0;13;4;1;4;5;1;-1;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;5;9;30000;0;10;4;1;2;5;-2;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;5;13;30000;0;5;4;1;2;5;1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;5;17;30000;0;4;4;1;4;5;-1;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;5;25;30000;0;12;4;1;2;5;-1;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;5;32;30000;0;3;4;1;4;5;-1;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;7;4;30000;0;15;4;1;3;7;-2;-1;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;7;5;30000;0;12;4;1;2;7;0;-2;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;7;7;30000;0;13;4;1;2;7;0;0;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;7;9;30000;0;5;4;1;4;7;1;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;7;13;30000;0;4;4;1;4;7;-2;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;9;4;30000;0;13;4;1;3;9;1;-2;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;9;5;30000;0;13;4;1;2;9;-2;-2;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;9;7;30000;0;13;4;1;4;9;0;0;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;9;9;30000;0;10;4;1;2;9;-2;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;9;13;30000;0;5;4;1;4;9;-2;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;10;4;30000;0;13;4;1;4;10;1;0;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;10;10;30000;0;10;4;1;4;10;1;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;13;4;30000;0;16;4;1;2;13;0;-1;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;13;5;30000;0;13;4;1;2;13;1;-2;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;13;7;30000;0;15;4;1;1;1;0;0;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;13;9;30000;0;10;4;1;2;1;0;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;13;13;30000;0;12;4;1;4;1;0;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;13;17;30000;0;10;4;1;4;1;-2;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;13;32;30000;0;12;4;1;2;1;-1;1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;15;4;30000;0;15;4;1;1;1;1;0;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;17;4;30000;0;16;4;1;3;1;1;-1;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;17;5;30000;0;13;4;1;2;1;0;-1;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;17;13;30000;0;14;4;1;2;1;-1;-2;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;17;17;30000;0;9;4;1;4;1;-1;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;17;32;30000;0;12;4;1;4;1;1;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;25;4;30000;0;14;4;1;3;1;0;-1;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;25;5;30000;0;15;4;1;2;25;1;-2;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;26;4;30000;0;17;4;1;3;1;-1;-1;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;28;4;30000;0;14;4;1;3;1;-1;-2;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;32;4;30000;0;18;4;1;1;1;0;-2;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;4;32;5;30000;0;14;4;1;2;1;-2;-2;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;32;13;30000;0;11;4;1;2;1;1;1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;32;17;30000;0;8;4;1;2;1;0;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;4;32;32;30000;0;13;4;1;4;1;1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;4;4;30000;0;14;5;1;2;1;-1;-1;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;4;5;30000;0;13;5;1;4;1;-2;-1;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;4;7;30000;0;13;5;1;2;1;0;-2;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;4;9;30000;0;11;5;1;1;1;-2;0;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;4;13;30000;0;12;5;1;4;5;-1;-1;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;4;17;30000;0;12;5;1;2;5;1;0;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;4;25;30000;0;12;5;1;4;5;0;-1;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;4;32;30000;0;12;5;1;2;5;-1;1;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;5;4;30000;0;15;5;1;2;1;-2;-1;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;5;5;30000;0;12;5;1;2;1;1;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;5;7;30000;0;12;5;1;3;1;-1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;5;9;30000;0;12;5;1;2;1;-2;1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;5;13;30000;0;5;5;1;4;1;-2;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;5;17;30000;0;4;5;1;4;1;0;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;5;28;30000;0;19;5;1;3;5;-1;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;5;32;30000;0;3;5;1;3;1;1;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;7;4;30000;0;15;5;1;2;1;-2;-1;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;7;5;30000;0;13;5;1;4;1;-2;-2;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;7;7;30000;0;10;5;1;2;1;-1;1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;7;9;30000;0;10;5;1;2;1;0;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;7;13;30000;0;10;5;1;3;1;-1;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;9;4;30000;0;16;5;1;2;1;-1;1;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;9;5;30000;0;13;5;1;2;1;1;0;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;9;7;30000;0;9;5;1;2;1;0;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;9;9;30000;0;10;5;1;2;1;0;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;13;4;30000;0;16;5;1;2;1;0;-1;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;13;5;30000;0;15;5;1;2;1;1;-2;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;13;7;30000;0;8;5;1;3;1;1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;13;13;30000;0;10;5;1;2;1;1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;13;17;30000;0;12;5;1;2;1;0;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;13;28;30000;0;5;5;1;2;13;-2;-1;0;1;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;13;32;30000;0;4;5;1;5;1;0;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;17;4;30000;0;18;5;1;2;1;0;-1;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;17;5;30000;0;13;5;1;5;1;-2;-1;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;17;13;30000;0;10;5;1;2;1;-2;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;17;17;30000;0;16;5;1;2;1;-2;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;17;32;30000;0;10;5;1;2;1;-1;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;25;4;30000;0;18;5;1;2;1;-2;-1;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;32;4;30000;0;18;5;1;2;1;1;0;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;5;32;5;30000;0;15;5;1;2;1;1;-2;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;32;13;30000;0;11;5;1;4;1;0;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;32;17;30000;0;11;5;1;5;1;0;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;5;32;32;30000;0;12;5;1;4;1;0;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;6;6;6;30000;0;10;6;1;2;1;1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;6;6;7;30000;0;12;6;1;6;1;-1;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;6;6;8;30000;0;12;6;1;3;1;-1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;6;7;6;30000;0;8;6;1;2;1;-2;1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;6;7;7;30000;0;10;6;1;2;1;0;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;6;7;8;30000;0;10;6;1;3;1;-2;1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;6;8;6;30000;0;8;6;1;2;1;-2;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;6;8;7;30000;0;10;6;1;6;1;1;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;6;8;8;30000;0;10;6;1;2;1;0;1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;7;4;4;30000;0;13;7;1;2;1;1;0;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;7;4;5;30000;0;11;7;1;2;1;0;-2;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;4;7;30000;0;13;7;1;2;1;-2;-1;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;4;9;30000;0;11;7;1;3;1;-1;2;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;4;13;30000;0;12;7;1;7;7;0;-1;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;7;5;4;30000;0;12;7;1;1;1;0;-2;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;5;5;30000;0;11;7;1;3;1;0;0;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;7;5;7;30000;0;12;7;1;4;7;-2;0;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;5;9;30000;0;12;7;1;3;1;-2;-1;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;5;13;30000;0;12;7;1;3;1;-1;-2;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;6;6;30000;0;12;7;1;3;1;0;-2;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;7;6;7;30000;0;12;7;1;3;1;-1;2;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;7;6;8;30000;0;12;7;1;2;1;1;-2;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;7;7;4;30000;0;15;7;1;4;1;-1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;7;7;5;30000;0;12;7;1;5;1;-2;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;7;6;30000;0;10;7;1;4;1;-2;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;7;7;30000;0;10;7;1;6;1;0;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;7;8;30000;0;12;7;1;5;1;0;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;7;7;9;30000;0;10;7;1;4;1;-1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;7;7;13;30000;0;12;7;1;7;1;-2;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;7;8;6;30000;0;10;7;1;2;1;-2;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;8;7;30000;0;12;7;1;3;1;0;4;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;8;8;30000;0;10;7;1;7;1;1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;7;9;4;30000;0;14;7;1;1;1;-2;-2;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;9;5;30000;0;10;7;1;4;1;-1;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;9;7;30000;0;12;7;1;5;1;1;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;13;4;30000;0;14;7;1;4;1;1;-1;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;13;5;30000;0;13;7;1;5;1;-1;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;7;13;7;30000;0;12;7;1;4;1;1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;7;13;13;30000;0;10;7;1;6;1;-2;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;8;6;6;30000;0;11;8;1;7;1;1;0;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;8;6;7;30000;0;12;8;1;6;1;-1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;8;6;8;30000;0;11;8;1;1;1;-1;-2;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;8;7;6;30000;0;12;8;1;7;1;-1;-1;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;8;7;7;30000;0;12;8;1;7;1;-1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;8;7;8;30000;0;11;8;1;1;1;-1;0;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;8;8;6;30000;0;10;8;1;2;1;0;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;8;8;7;30000;0;12;8;1;2;1;0;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;8;8;8;30000;0;12;8;1;2;1;-2;1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;9;4;4;30000;0;13;9;1;2;1;0;-1;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;9;4;5;30000;0;13;9;1;7;1;0;0;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;9;4;7;30000;0;12;9;1;7;1;-2;0;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;9;4;9;30000;0;10;8;1;7;9;1;-2;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;9;4;13;30000;0;12;9;1;9;9;-1;-2;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;9;5;4;30000;0;11;9;1;4;1;-2;-1;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;9;5;5;30000;0;13;9;1;9;1;0;0;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;9;5;7;30000;0;10;8;1;2;9;1;1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;9;5;9;30000;0;5;9;1;8;1;1;2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;9;7;4;30000;0;12;9;1;2;1;-2;-1;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;9;7;5;30000;0;8;8;1;9;1;1;1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;9;7;7;30000;0;10;8;1;3;1;-1;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;9;9;4;30000;0;17;9;1;7;1;-1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;9;9;5;30000;0;17;9;1;4;1;-2;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;9;9;9;30000;0;12;9;1;4;1;-2;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;9;9;16;30000;0;19;9;1;8;1;0;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;9;9;22;30000;0;12;8;1;7;1;0;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;9;9;32;30000;0;6;9;1;8;1;-2;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;9;13;4;30000;0;13;9;1;2;1;-2;1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;9;16;9;30000;0;16;9;1;7;1;-1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;9;16;16;30000;0;10;8;1;2;1;0;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;9;16;22;30000;0;6;8;1;2;1;-2;2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;9;22;9;30000;0;16;9;1;7;1;0;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;9;22;16;30000;0;12;9;1;5;1;-2;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;9;22;22;30000;0;15;9;1;8;1;1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;9;22;32;30000;0;15;9;1;7;1;-1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;9;32;9;30000;0;16;9;1;6;32;1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;9;32;22;30000;0;10;9;1;6;32;1;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;9;32;32;30000;0;12;9;1;2;32;0;1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;10;4;4;30000;0;8;8;1;5;1;0;1;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;10;4;10;30000;0;10;8;1;3;10;0;3;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;10;10;4;30000;0;17;10;1;7;1;0;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;10;10;10;30000;0;12;10;1;7;1;1;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;13;4;4;30000;0;8;8;1;7;1;1;-1;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;4;5;30000;0;8;8;1;2;13;-1;2;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;13;4;7;30000;0;8;8;1;10;1;0;0;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;4;9;30000;0;9;8;1;10;13;0;2;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;13;4;13;30000;0;6;8;1;9;13;-2;6;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;4;17;30000;0;5;8;1;7;13;0;4;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;4;32;30000;0;12;8;1;13;1;0;2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;13;5;4;30000;0;8;8;1;8;1;-2;3;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;5;5;30000;0;8;8;1;3;1;1;2;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;5;7;30000;0;8;8;1;10;1;1;0;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;5;13;30000;0;9;8;1;5;13;1;-2;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;5;17;30000;0;12;8;1;7;1;-1;-1;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;5;28;30000;0;3;13;1;3;13;-2;2;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;5;32;30000;0;3;8;1;3;1;0;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;13;7;4;30000;0;11;8;1;4;1;1;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;13;7;5;30000;0;9;8;1;4;1;0;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;13;7;7;30000;0;13;8;1;13;1;-2;1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;13;7;13;30000;0;12;8;1;4;1;0;1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;13;9;4;30000;0;12;8;1;11;1;1;2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;13;13;4;30000;0;17;13;1;11;1;-2;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;13;5;30000;0;17;13;1;9;1;-1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;13;7;30000;0;16;13;1;4;13;-2;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;13;13;13;30000;0;12;13;1;6;1;-1;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;13;13;17;30000;0;15;13;1;10;1;1;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;13;13;28;30000;0;19;13;1;13;13;-1;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;13;13;32;30000;0;19;13;1;9;1;1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;17;4;30000;0;17;13;1;10;1;1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;17;5;30000;0;17;13;1;6;1;-1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;17;13;30000;0;16;13;1;6;1;1;1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;17;17;30000;0;15;13;1;3;1;0;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;17;32;30000;0;20;13;1;3;1;1;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;32;4;30000;0;21;8;1;13;1;1;2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;32;5;30000;0;17;13;1;5;1;1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;32;13;30000;0;18;13;1;2;1;-1;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;13;32;17;30000;0;19;8;1;4;1;0;3;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;13;32;32;30000;0;15;13;1;11;1;0;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;14;14;14;30000;0;16;14;1;14;14;1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;14;14;55;30000;0;20;14;1;7;14;-1;2;0;1;0;1;0;1;1;0;0
+NVIDIA GH200 480GB [0x3528];3;14;55;14;30000;0;26;14;1;7;55;-2;1;0;1;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;14;55;55;30000;0;60;14;1;10;55;0;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;15;4;4;30000;0;8;8;1;3;1;1;2;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;15;15;15;30000;0;21;15;1;10;1;-1;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;16;9;9;30000;0;12;8;1;16;1;-2;3;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;16;9;16;30000;0;10;8;1;14;16;1;0;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;16;9;22;30000;0;12;8;1;14;16;-2;0;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;16;16;9;30000;0;16;8;1;3;1;0;2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;16;16;16;30000;0;12;8;1;13;1;1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;16;16;22;30000;0;21;16;1;13;1;-2;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;16;22;9;30000;0;16;16;1;8;1;0;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;16;22;16;30000;0;12;16;1;15;1;-2;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;16;22;22;30000;0;22;16;1;14;1;1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;4;4;30000;0;8;8;1;9;1;-2;0;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;4;5;30000;0;10;8;1;14;1;0;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;17;4;13;30000;0;5;8;1;3;17;0;1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;4;17;30000;0;12;8;1;14;17;0;-2;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;4;32;30000;0;3;8;1;2;17;-1;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;17;5;4;30000;0;8;8;1;15;1;1;2;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;5;5;30000;0;8;8;1;15;17;-2;-2;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;17;5;13;30000;0;10;8;1;7;17;0;-2;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;17;5;17;30000;0;12;8;1;9;17;-2;-1;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;5;32;30000;0;16;8;1;2;17;-1;0;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;13;4;30000;0;14;17;1;11;1;0;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;13;5;30000;0;13;17;1;5;1;-1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;13;13;30000;0;19;8;1;16;1;-1;2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;17;13;17;30000;0;17;17;1;7;1;1;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;17;13;32;30000;0;19;17;1;16;1;-1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;17;4;30000;0;23;17;1;7;1;-2;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;17;17;5;30000;0;23;17;1;15;1;-1;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;17;17;13;30000;0;21;17;1;8;1;-1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;17;17;30000;0;16;17;1;9;1;0;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;17;32;30000;0;5;17;1;3;1;0;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;32;4;30000;0;23;17;1;11;1;-2;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;32;5;30000;0;22;17;1;6;1;0;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;32;13;30000;0;15;17;1;9;1;-2;-1;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;17;32;17;30000;0;15;17;1;13;1;0;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;17;32;32;30000;0;20;17;1;17;1;-2;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;20;20;20;30000;0;19;20;1;8;1;1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;22;9;9;30000;0;10;8;1;9;1;0;3;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;22;9;16;30000;0;10;8;1;21;1;-1;1;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;22;9;22;30000;0;12;8;1;6;22;1;0;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;22;9;32;30000;0;20;8;1;20;22;0;1;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;22;16;9;30000;0;19;8;1;5;1;-2;4;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;22;16;16;30000;0;19;8;1;15;1;0;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;22;16;22;30000;0;12;8;1;3;1;0;2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;22;22;9;30000;0;23;22;1;17;1;0;0;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;22;22;16;30000;0;20;22;1;5;1;1;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;22;22;22;30000;0;23;22;1;14;1;-1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;22;22;32;30000;0;30;22;1;7;1;0;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;22;32;9;30000;0;17;22;1;20;32;-1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;22;32;22;30000;0;20;22;1;9;32;0;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;22;32;32;30000;0;40;22;1;17;32;-1;3;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;23;23;23;30000;0;20;23;1;16;23;1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;24;24;24;30000;0;20;24;1;16;1;-2;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;25;4;4;30000;0;8;8;1;13;25;0;-2;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;25;4;5;30000;0;8;8;1;18;25;0;5;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;25;5;4;30000;0;8;8;1;21;1;-2;1;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;25;25;25;30000;0;24;25;1;21;1;-1;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;26;4;4;30000;0;10;8;1;6;26;-1;3;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;28;4;4;30000;0;11;8;1;12;1;-1;5;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;28;28;28;30000;0;6;16;1;8;1;-2;2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;30;30;30;30000;0;50;30;1;19;1;-1;-2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;32;4;4;30000;0;11;8;1;15;32;-1;-2;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;32;4;5;30000;0;11;8;1;16;32;1;0;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;4;13;30000;0;10;8;1;1;32;1;0;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;32;4;17;30000;0;12;8;1;9;32;1;0;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;4;32;30000;0;6;8;1;21;32;1;4;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;5;4;30000;0;14;8;1;14;1;1;-2;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;32;5;5;30000;0;15;8;1;10;1;-2;-2;0;0;0;1;0;2;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;5;13;30000;0;10;8;1;31;32;1;-2;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;5;17;30000;0;12;8;1;3;32;0;4;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;5;32;30000;0;15;8;1;25;32;0;0;0;0;0;1;0;2;0;0;0
+NVIDIA GH200 480GB [0x3528];3;32;13;4;30000;0;23;8;1;4;1;-1;-2;0;0;0;1;0;0;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;13;5;30000;0;23;8;1;4;1;-1;-2;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;32;13;13;30000;0;20;8;1;18;1;-1;5;0;0;0;1;0;0;0;0;0
+NVIDIA GH200 480GB [0x3528];3;32;13;17;30000;0;30;8;1;5;1;-2;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;13;32;30000;0;13;8;1;16;1;-1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;17;4;30000;0;23;32;1;17;1;-1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;17;5;30000;0;18;32;1;14;1;-1;1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;17;13;30000;0;20;32;1;11;1;0;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;17;17;30000;0;20;32;1;4;1;-2;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;17;32;30000;0;10;8;1;23;1;1;4;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;32;4;30000;0;26;32;1;4;1;0;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;32;5;30000;0;29;32;1;11;1;1;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;32;9;30000;0;23;32;1;21;32;0;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;32;13;30000;0;30;32;1;6;1;0;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;32;17;30000;0;23;32;1;22;1;0;-1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;32;22;30000;0;20;32;1;27;32;0;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;32;32;32;30000;0;19;32;1;29;1;-1;-2;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;35;35;35;30000;0;57;24;1;2;1;0;2;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;36;36;36;30000;0;15;36;1;2;1;-1;3;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;40;40;40;30000;0;57;16;1;6;1;-1;1;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;55;14;14;30000;0;23;55;1;29;55;-1;4;0;0;0;1;0;1;0;0;0
+NVIDIA GH200 480GB [0x3528];3;55;14;55;30000;0;8;55;1;2;55;1;7;0;0;0;1;0;1;1;0;0
+NVIDIA GH200 480GB [0x3528];3;55;55;14;30000;0;60;55;1;51;55;0;-2;0;1;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;55;55;55;30000;0;13;55;1;5;55;0;0;0;0;0;1;0;1;2;0;0
+NVIDIA GH200 480GB [0x3528];3;64;64;64;30000;0;5;64;1;18;64;0;6;0;0;0;1;0;1;0;0;0
diff --git a/src/acc/opencl/smm/params/tune_multiply_Mi250.csv b/src/acc/opencl/smm/params/tune_multiply_Mi250.csv
index 4410500501c..d73036e2ca9 100644
--- a/src/acc/opencl/smm/params/tune_multiply_Mi250.csv
+++ b/src/acc/opencl/smm/params/tune_multiply_Mi250.csv
@@ -281,6 +281,7 @@ gfx90a [0x989f];3;12;23;12;30000;0;20;12;1;1;1;0;-2;0;0;1;1;1;2;2;1;0
 gfx90a [0x989f];3;12;23;23;30000;0;3;8;1;8;1;-2;-2;0;0;0;1;0;1;2;0;0
 gfx90a [0x989f];3;13;13;13;30000;0;12;13;1;11;1;0;-1;0;1;0;1;1;0;2;1;0
 gfx90a [0x989f];3;13;13;23;30000;0;12;13;1;1;1;-2;-2;0;1;0;1;1;2;0;1;0
+gfx90a [0x989f];3;13;13;32;30000;0;3;13;1;9;13;0;2;0;0;0;1;0;1;2;0;0
 gfx90a [0x989f];3;13;23;13;30000;0;30;8;1;10;1;1;2;0;0;0;1;0;1;0;0;0
 gfx90a [0x989f];3;13;23;23;30000;0;3;8;1;6;1;-1;2;0;0;0;1;0;1;2;0;0
 gfx90a [0x989f];3;14;14;14;30000;0;10;14;1;10;1;0;0;0;1;1;1;1;0;2;0;0
@@ -351,6 +352,7 @@ gfx90a [0x989f];3;18;23;23;30000;0;4;8;1;16;1;0;3;0;0;0;1;0;1;2;0;0
 gfx90a [0x989f];3;19;19;19;30000;0;40;8;1;10;1;1;3;0;0;0;1;0;1;2;0;0
 gfx90a [0x989f];3;19;19;23;30000;0;40;8;1;15;1;-1;-2;0;0;0;1;0;1;0;0;0
 gfx90a [0x989f];3;23;23;23;30000;0;4;8;1;22;23;-1;3;0;0;0;1;0;1;0;0;0
+gfx90a [0x989f];3;28;28;28;30000;0;3;28;1;28;28;-2;2;0;0;0;1;0;1;0;0;0
 gfx90a [0x989f];3;32;32;32;30000;0;25;32;1;20;1;-2;0;0;1;0;1;0;2;0;0;0
 gfx90a [0x989f];3;35;17;17;30000;0;15;35;1;29;1;1;0;0;1;0;1;0;2;1;0;0
 gfx90a [0x989f];3;35;17;32;30000;0;20;35;1;1;1;0;-2;1;1;0;1;1;2;0;0;0
diff --git a/src/acc/opencl/smm/params/tune_multiply_PVC.csv b/src/acc/opencl/smm/params/tune_multiply_PVC.csv
index 34e16e0b964..5b5a9648737 100644
--- a/src/acc/opencl/smm/params/tune_multiply_PVC.csv
+++ b/src/acc/opencl/smm/params/tune_multiply_PVC.csv
@@ -2,7 +2,7 @@ DEVICE;TYPEID;M;N;K;S;GFLOPS;BS;BM;BN;BK;WS;WG;LU;NZ;AL;TB;TC;AP;AA;AB;AC
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;2;2;2;30000;0;8;2;1;1;1;-1;0;0;1;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;3;3;1;30000;0;8;3;1;1;1;1;1;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;3;3;3;30000;0;8;3;1;1;1;-1;-2;0;0;0;1;0;2;0;0;0
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;4;30000;0;10;4;1;4;1;1;-1;0;0;0;1;0;2;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;4;30000;0;9;4;1;3;1;-1;1;0;1;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;5;30000;0;10;4;1;3;1;-1;-2;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;7;30000;0;9;4;1;3;1;1;0;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;9;30000;0;8;4;1;2;1;-2;-2;0;0;0;1;0;2;2;0;0
@@ -13,7 +13,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;17;30000;0;8;4;1;1;1;-1;1;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;25;30000;0;8;4;1;1;1;-2;0;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;26;30000;0;8;4;1;1;1;-1;0;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;28;30000;0;8;4;1;1;1;1;1;0;0;0;1;0;2;0;0;0
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;32;30000;0;8;4;1;1;1;-2;-1;0;0;0;1;0;2;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;32;30000;0;8;4;1;1;1;-2;-2;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;4;45;30000;0;8;4;1;1;4;-1;-1;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;5;4;30000;0;10;4;1;1;1;-2;0;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;5;5;30000;0;10;4;1;3;1;0;-2;0;0;0;1;0;0;0;0;0
@@ -71,7 +71,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;17;32;30000;0;8;4;1;1;1;1;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;4;30000;0;14;4;1;2;1;-2;0;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;5;30000;0;14;4;1;4;1;1;0;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;7;30000;0;14;4;1;3;25;1;1;0;0;0;1;0;2;0;0;0
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;9;30000;0;10;4;1;1;25;0;0;0;0;0;1;0;2;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;9;30000;0;10;4;1;1;25;1;0;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;13;30000;0;8;4;1;1;25;-1;0;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;25;30000;0;8;4;1;1;25;-2;0;0;1;0;1;1;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;4;25;26;30000;0;8;4;1;1;25;-2;0;0;0;0;1;1;0;0;0;0
@@ -131,16 +131,19 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;4;28;30000;0;8;5;1;1;5;-2;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;4;32;30000;0;8;5;1;1;1;-2;1;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;4;45;30000;0;8;5;1;1;5;-2;-2;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;4;30000;0;12;5;1;1;1;1;-2;0;0;0;1;0;0;2;0;0
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;5;30000;0;12;5;1;1;1;1;1;0;0;0;1;0;0;1;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;5;30000;0;10;5;1;4;1;-1;0;0;1;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;7;30000;0;8;5;1;1;1;-2;0;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;9;30000;0;8;5;1;1;1;-1;-2;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;13;30000;0;8;5;1;1;1;-2;1;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;16;30000;0;8;5;1;1;1;-2;1;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;17;30000;0;8;5;1;1;1;-1;-1;0;0;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;20;30000;0;8;5;1;1;5;1;0;0;1;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;24;30000;0;8;5;1;1;1;-2;-2;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;26;30000;0;8;5;1;1;1;-2;-2;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;28;30000;0;8;5;1;1;5;-2;-2;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;5;32;30000;0;8;5;1;1;1;-2;-1;0;0;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;6;20;30000;0;8;5;1;1;6;-1;-1;0;0;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;6;32;30000;0;8;5;1;1;6;1;1;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;7;4;30000;0;12;5;1;1;1;-2;1;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;7;5;30000;0;12;5;1;1;1;-1;0;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;7;7;30000;0;10;5;1;1;1;-2;-1;0;0;0;1;0;0;2;0;0
@@ -156,6 +159,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;7;30000;0;14;5;1;1;1;-1;1;0;0;
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;13;30000;0;10;5;1;1;1;1;0;0;1;1;1;1;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;16;30000;0;8;5;1;1;1;-2;-1;1;0;0;1;1;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;17;30000;0;8;5;1;1;1;-1;-1;1;1;0;1;1;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;20;30000;0;8;5;1;1;13;-2;-1;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;24;30000;0;8;5;1;1;1;-2;4;1;1;0;1;1;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;26;30000;0;8;5;1;1;1;-1;3;0;0;0;1;1;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;13;32;30000;0;8;5;1;1;1;-1;-2;0;0;0;1;1;2;2;0;0
@@ -171,6 +175,11 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;17;5;30000;0;16;5;1;1;1;1;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;17;13;30000;0;8;5;1;1;1;-2;0;1;0;0;1;1;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;17;17;30000;0;8;5;1;1;1;-1;-2;0;1;0;1;1;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;17;32;30000;0;8;5;1;1;1;-1;-2;0;0;1;1;1;2;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;20;5;30000;0;16;5;1;2;20;1;1;0;1;0;1;0;0;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;20;6;30000;0;14;5;1;1;20;-1;1;0;0;0;1;0;0;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;20;13;30000;0;14;5;1;1;20;-1;-2;0;1;0;1;0;2;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;20;20;30000;0;8;5;1;1;20;-1;-2;0;0;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;20;32;30000;0;8;5;1;1;20;1;-2;0;1;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;24;5;30000;0;16;5;1;1;1;1;1;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;24;13;30000;0;15;5;1;1;1;0;1;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;24;16;30000;0;15;5;1;1;1;-1;1;0;0;0;1;0;2;2;0;0
@@ -185,7 +194,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;25;13;30000;0;8;5;1;1;25;0;-1;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;25;25;30000;0;8;5;1;1;25;0;-1;1;0;0;1;1;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;25;26;30000;0;8;5;1;1;25;-2;-2;0;0;0;1;1;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;25;28;30000;0;8;5;1;1;25;-1;0;0;0;0;1;1;2;0;0;0
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;25;32;30000;0;8;5;1;1;25;0;1;0;0;0;1;0;0;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;25;32;30000;0;8;5;1;1;25;1;0;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;25;45;30000;0;8;5;1;1;25;-1;0;0;0;0;1;1;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;26;4;30000;0;18;5;1;1;26;-2;1;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;26;5;30000;0;18;5;1;1;1;-2;-2;0;0;0;1;0;0;0;0;0
@@ -212,10 +221,12 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;28;32;30000;0;8;5;1;1;28;-2;-2;1;
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;28;45;30000;0;8;5;1;1;28;1;-2;0;1;0;1;1;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;4;30000;0;18;5;1;5;1;0;-1;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;5;30000;0;17;5;1;1;1;1;-2;0;0;0;1;0;2;1;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;6;30000;0;18;5;1;2;32;0;-1;0;1;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;7;30000;0;15;5;1;1;32;-2;-2;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;9;30000;0;15;5;1;1;32;1;-1;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;13;30000;0;15;5;1;1;1;1;0;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;17;30000;0;8;5;1;1;1;-1;-2;0;1;0;1;0;2;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;20;30000;0;18;5;1;1;32;-1;1;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;24;30000;0;8;5;1;1;32;-1;0;0;0;0;1;1;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;25;30000;0;8;5;1;1;32;-1;2;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;32;26;30000;0;8;5;1;1;32;-2;-1;0;0;0;1;0;0;0;0;0
@@ -232,15 +243,31 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;45;26;30000;0;6;5;1;1;45;-2;-1;0;
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;45;28;30000;0;5;5;1;1;45;-1;-2;1;0;1;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;45;32;30000;0;4;5;1;1;45;-1;-2;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;5;45;45;30000;0;9;5;1;1;45;-2;-1;0;0;0;1;1;0;2;0;1
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;5;20;30000;0;8;6;1;1;6;-2;-1;0;0;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;5;32;30000;0;8;6;1;1;6;-2;-1;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;6;6;30000;0;9;6;1;3;1;-2;0;0;1;0;1;1;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;6;7;30000;0;10;6;1;1;1;-1;-1;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;6;8;30000;0;10;6;1;1;1;-2;-1;0;0;0;1;0;0;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;6;20;30000;0;10;6;1;1;6;-2;-2;0;1;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;6;32;30000;0;10;6;1;1;6;-1;-1;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;7;6;30000;0;14;6;1;1;1;-1;-2;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;7;7;30000;0;10;6;1;1;1;-2;0;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;7;8;30000;0;11;6;1;1;1;1;-2;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;8;6;30000;0;16;6;1;1;1;-1;0;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;8;7;30000;0;14;6;1;1;1;-1;-2;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;8;8;30000;0;13;6;1;1;1;-1;0;0;0;0;1;0;2;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;13;20;30000;0;10;6;1;1;13;1;1;0;0;0;1;0;0;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;13;32;30000;0;8;6;1;1;13;1;1;0;0;0;1;0;2;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;20;5;30000;0;18;6;1;1;20;-1;1;0;0;0;1;0;0;1;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;20;6;30000;0;15;6;1;1;20;-2;1;0;0;0;1;0;0;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;20;13;30000;0;14;6;1;1;20;-1;-1;0;0;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;20;20;30000;0;8;6;1;1;20;-1;-1;0;0;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;20;32;30000;0;8;6;1;1;20;0;-2;0;0;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;32;5;30000;0;20;6;1;6;32;1;-1;0;0;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;32;6;30000;0;18;6;1;6;32;1;1;0;0;0;1;0;2;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;32;13;30000;0;15;6;1;1;32;1;1;0;0;0;1;0;2;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;32;20;30000;0;8;6;1;1;32;-1;1;0;0;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;6;32;32;30000;0;8;6;1;1;32;1;-2;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;4;4;30000;0;14;7;1;1;1;-2;1;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;4;5;30000;0;14;7;1;3;1;1;0;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;4;7;30000;0;11;7;1;1;1;-2;0;0;0;0;1;0;2;0;0;0
@@ -267,7 +294,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;6;8;30000;0;10;7;1;1;1;1;1;0;0;0;
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;4;30000;0;14;7;1;1;1;1;-1;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;5;30000;0;16;7;1;1;1;-1;0;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;6;30000;0;16;7;1;1;1;1;-1;0;0;0;1;0;0;0;0;0
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;7;30000;0;9;7;1;2;1;1;-2;0;1;0;1;1;0;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;7;30000;0;9;7;1;3;1;0;-2;0;1;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;8;30000;0;14;7;1;1;1;1;-1;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;9;30000;0;13;7;1;1;1;-2;-1;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;13;30000;0;10;7;1;1;1;-1;-2;0;0;0;1;0;0;2;0;0
@@ -279,7 +306,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;7;45;30000;0;8;7;1;1;7;-1;-1;0;0;
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;8;6;30000;0;16;7;1;1;1;-2;-2;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;8;7;30000;0;14;7;1;1;1;1;1;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;8;8;30000;0;13;7;1;1;1;1;1;0;0;0;1;0;2;0;0;0
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;9;4;30000;0;16;7;1;1;1;-1;0;0;0;0;1;0;2;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;9;4;30000;0;16;7;1;2;1;-1;-2;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;9;5;30000;0;16;7;1;1;1;-2;-1;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;9;7;30000;0;17;7;1;1;1;-1;0;0;1;0;1;1;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;7;9;9;30000;0;14;7;1;1;9;-2;-2;0;0;0;1;0;2;0;0;0
@@ -405,8 +432,8 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;13;30000;0;15;9;1;2;28;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;25;30000;0;8;9;1;1;28;0;0;0;1;0;1;0;1;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;26;30000;0;10;9;1;1;28;-2;0;0;1;1;1;0;0;0;1;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;28;30000;0;12;9;1;1;28;0;0;0;0;0;1;0;2;0;0;0
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;32;30000;0;15;9;1;5;28;-2;-1;0;0;0;1;0;2;0;0;1
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;45;30000;0;19;9;1;1;28;1;0;0;0;0;1;0;0;0;0;1
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;32;30000;0;15;9;1;4;28;-2;-1;0;0;0;1;0;1;0;0;1
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;28;45;30000;0;15;9;1;1;28;1;0;0;0;0;1;0;0;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;32;4;30000;0;39;9;1;1;32;-1;-1;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;32;5;30000;0;38;9;1;1;32;-1;-1;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;9;32;7;30000;0;25;9;1;8;32;0;0;0;0;0;1;0;1;2;0;0
@@ -430,6 +457,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;10;15;4;30000;0;24;10;1;1;15;-1;-2;
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;10;15;10;30000;0;16;10;1;3;15;-2;-1;0;0;0;1;0;1;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;10;15;15;30000;0;15;10;1;1;15;1;-1;0;0;0;1;0;0;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;12;12;12;30000;0;8;12;1;12;12;-2;-1;1;1;1;1;1;2;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;1;11;30000;0;8;13;1;1;13;-1;1;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;4;4;30000;0;16;13;1;1;1;-2;-2;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;4;5;30000;0;13;13;1;1;1;-1;-2;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;4;7;30000;0;13;13;1;2;1;1;0;0;0;0;1;0;0;2;0;0
@@ -448,12 +476,15 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;9;30000;0;13;13;1;3;13;-1;-1;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;13;30000;0;8;13;1;3;1;-1;1;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;16;30000;0;8;13;1;1;1;-2;0;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;17;30000;0;8;13;1;7;1;-2;-2;0;1;0;1;1;0;2;1;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;20;30000;0;8;13;1;1;13;-2;-1;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;24;30000;0;8;13;1;1;1;-2;1;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;25;30000;0;8;13;1;1;13;1;-2;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;26;30000;0;8;13;1;10;1;-1;-2;0;0;0;1;0;0;2;1;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;28;30000;0;8;13;1;1;13;-2;0;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;32;30000;0;8;13;1;1;1;-2;0;0;1;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;5;45;30000;0;8;13;1;1;13;-1;0;0;0;0;1;0;2;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;6;20;30000;0;8;13;1;1;13;-1;0;0;0;0;1;0;2;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;6;32;30000;0;8;13;1;1;13;-1;-1;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;7;4;30000;0;17;13;1;1;1;-2;-2;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;7;5;30000;0;16;13;1;1;1;-2;0;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;7;7;30000;0;13;13;1;12;1;-2;1;0;0;0;1;0;2;2;0;0
@@ -481,11 +512,12 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;9;30000;0;24;13;1;1;13;-1;-2;
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;13;30000;0;8;13;1;4;1;-1;-2;0;1;1;1;1;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;14;30000;0;18;13;1;1;13;-1;1;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;17;30000;0;15;13;1;1;1;-1;1;0;0;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;20;30000;0;24;13;1;1;13;-1;0;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;24;30000;0;12;13;1;1;13;-1;-1;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;25;30000;0;8;13;1;1;13;-1;0;1;0;1;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;26;30000;0;14;13;1;1;13;-1;-2;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;28;30000;0;8;13;1;1;13;-1;-1;1;1;0;1;1;0;0;0;0
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;32;30000;0;8;13;1;1;1;-1;0;1;0;0;1;0;0;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;32;30000;0;8;13;1;1;1;-2;1;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;13;45;30000;0;8;13;1;1;13;-2;0;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;14;13;30000;0;15;13;1;1;14;-2;1;0;0;0;1;0;0;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;14;14;30000;0;15;13;1;1;14;-2;1;0;0;0;1;0;2;0;0;0
@@ -498,6 +530,11 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;17;5;30000;0;19;13;1;10;1;0;-1;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;17;13;30000;0;9;13;1;9;1;1;-1;1;1;0;1;1;2;0;1;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;17;17;30000;0;8;13;1;1;1;-2;1;1;0;1;1;1;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;17;32;30000;0;8;13;1;1;1;1;-2;1;0;0;1;1;2;0;1;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;20;5;30000;0;17;13;1;8;20;0;1;0;0;0;1;0;1;1;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;20;6;30000;0;9;13;1;12;20;-1;-2;0;0;0;1;0;1;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;20;13;30000;0;11;13;1;9;20;1;-1;0;0;0;1;0;1;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;20;20;30000;0;17;8;1;6;20;0;-1;0;0;0;1;0;1;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;20;32;30000;0;15;8;1;1;20;-2;1;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;24;5;30000;0;22;13;1;5;1;1;-1;1;1;0;1;0;1;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;24;13;30000;0;13;13;1;10;24;-1;0;0;0;0;1;0;1;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;24;24;30000;0;8;13;1;1;24;1;1;1;0;0;1;1;0;0;0;0
@@ -533,20 +570,22 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;9;30000;0;16;13;1;10;28;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;13;30000;0;13;13;1;11;28;1;1;0;0;0;1;0;1;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;25;30000;0;15;13;1;1;28;-1;-2;0;0;0;1;0;0;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;26;30000;0;15;13;1;4;28;1;-2;0;0;0;1;0;1;0;0;1
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;28;30000;0;16;13;1;1;28;1;-1;0;0;0;1;0;0;0;0;1
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;28;30000;0;15;13;1;1;28;1;0;0;0;0;1;0;0;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;32;30000;0;15;13;1;1;28;1;-2;0;0;0;1;0;0;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;28;45;30000;0;15;13;1;1;28;0;1;0;0;0;1;0;0;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;4;30000;0;25;13;1;7;1;-1;0;0;0;0;1;0;1;1;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;5;30000;0;25;13;1;11;1;0;-2;0;0;0;1;0;1;1;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;6;30000;0;25;13;1;5;32;-2;0;0;0;0;1;0;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;7;30000;0;25;13;1;4;32;0;0;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;9;30000;0;16;13;1;7;32;-2;0;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;13;30000;0;13;13;1;13;1;-1;-1;0;0;0;1;0;1;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;14;30000;0;13;13;1;12;32;-2;-1;0;0;0;1;0;1;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;17;30000;0;8;13;1;1;1;-1;1;1;1;0;1;1;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;20;30000;0;15;13;1;1;32;-1;2;0;0;0;1;0;1;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;24;30000;0;16;13;1;1;32;-2;-2;1;1;0;1;0;2;2;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;25;30000;0;15;13;1;1;32;0;-2;0;0;0;1;0;0;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;26;30000;0;15;13;1;1;32;0;-2;0;0;0;1;0;0;0;0;1
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;28;30000;0;15;13;1;4;32;-2;1;0;0;0;1;0;2;2;0;1
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;28;30000;0;15;13;1;4;32;1;0;0;0;0;1;0;1;2;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;32;30000;0;15;13;1;1;1;-2;0;0;0;0;1;0;0;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;32;45;30000;0;15;13;1;1;32;-1;1;0;0;0;1;0;0;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;13;45;4;30000;0;30;13;1;11;45;1;1;0;0;0;1;0;0;1;0;0
@@ -625,7 +664,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;5;30000;0;35;16;1;1;1;-2;0;0;
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;9;30000;0;25;16;1;1;1;-1;-2;1;1;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;14;30000;0;25;16;1;1;16;-1;1;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;16;30000;0;8;16;1;3;1;-2;1;0;1;0;1;0;0;2;0;0
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;22;30000;0;15;16;1;1;1;-1;0;0;0;0;1;0;0;0;0;1
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;22;30000;0;15;16;1;1;1;-1;1;0;0;0;1;0;0;2;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;29;30000;0;25;16;1;1;16;1;1;0;0;0;1;0;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;16;55;30000;0;8;16;1;1;16;-2;0;1;1;0;1;1;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;22;9;30000;0;21;16;1;9;1;-2;-1;1;1;1;1;1;0;0;1;0
@@ -636,7 +675,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;26;5;30000;0;20;16;1;15;1;-1;1;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;29;14;30000;0;15;8;1;13;29;-1;0;0;0;0;1;0;1;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;29;16;30000;0;15;8;1;13;29;1;-1;0;0;0;1;0;1;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;29;29;30000;0;16;16;1;1;29;1;0;1;0;1;1;1;0;0;0;1
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;29;55;30000;0;15;16;1;1;29;1;-1;0;0;0;1;0;0;0;0;1
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;29;55;30000;0;15;16;1;1;29;1;0;0;0;0;1;0;0;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;55;16;30000;0;32;16;1;1;55;1;-1;1;1;1;1;0;2;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;55;29;30000;0;30;16;1;1;55;-1;0;1;0;0;1;1;2;2;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;16;55;55;30000;0;15;16;1;1;55;1;1;1;1;0;1;0;2;0;0;0
@@ -662,7 +701,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;17;13;30000;0;8;17;1;1;1;0;1;1;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;17;17;30000;0;8;17;1;4;1;-1;-1;0;1;1;1;1;2;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;17;32;30000;0;15;17;1;1;1;-2;1;0;0;0;1;0;0;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;17;35;30000;0;15;17;1;1;1;-2;-1;0;0;0;1;0;0;0;0;1
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;32;4;30000;0;25;17;1;7;1;-2;-2;0;0;0;1;0;1;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;32;4;30000;0;25;17;1;6;1;-2;-2;0;0;0;1;0;1;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;32;5;30000;0;28;17;1;8;1;-2;1;0;0;1;1;1;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;32;13;30000;0;11;17;1;1;0;-2;1;1;0;1;1;1;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;32;17;30000;0;15;17;1;10;1;-2;-2;0;0;0;1;0;2;2;0;1
@@ -671,7 +710,15 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;32;35;30000;0;15;17;1;1;1;-1;-1;
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;35;17;30000;0;30;17;1;1;1;-2;1;0;0;0;1;0;0;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;35;32;30000;0;30;17;1;1;1;1;-2;0;1;0;1;1;0;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;17;35;35;30000;0;30;17;1;1;1;0;-2;1;1;1;1;1;2;0;0;1
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;20;5;30000;0;13;20;1;11;20;-2;1;0;0;0;1;0;2;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;20;6;30000;0;15;8;1;14;20;-2;-2;0;0;0;1;0;1;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;20;20;30000;0;8;20;1;1;1;-1;1;0;1;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;20;32;30000;0;15;20;1;1;20;0;3;0;0;0;1;0;2;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;32;5;30000;0;21;20;1;14;32;-2;1;0;0;0;1;0;2;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;32;6;30000;0;18;20;1;9;32;1;0;0;0;0;1;0;1;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;32;13;30000;0;18;8;1;7;32;-1;2;0;0;0;1;0;1;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;32;20;30000;0;16;20;1;4;32;1;2;0;0;0;1;0;1;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;20;32;32;30000;0;6;8;1;13;32;0;1;0;0;0;1;0;1;1;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;22;9;9;30000;0;14;22;1;1;1;-1;-1;1;1;0;1;1;0;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;22;9;16;30000;0;10;22;1;1;1;-2;-2;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;22;9;22;30000;0;8;22;1;1;1;-1;0;0;0;0;1;1;0;2;0;0
@@ -941,7 +988,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;26;45;13;30000;0;17;26;1;12;45;0;1;
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;26;45;25;30000;0;10;8;1;1;45;1;-2;0;0;0;1;0;1;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;26;45;26;30000;0;5;26;1;1;45;-2;1;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;26;45;28;30000;0;1;26;1;19;45;-2;1;0;0;0;1;0;0;0;0;0
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;26;45;32;30000;0;17;8;1;5;45;-2;2;0;0;0;1;0;1;2;0;1
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;26;45;32;30000;0;17;16;1;24;45;-2;2;0;0;0;1;0;1;2;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;26;45;45;30000;0;5;26;1;1;45;1;1;0;1;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;28;4;4;30000;0;16;28;1;7;1;-2;4;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;28;4;5;30000;0;10;8;1;26;28;1;1;0;0;0;1;0;0;2;0;0
@@ -1063,7 +1110,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;29;32;32;30000;0;30;29;1;1;32;-2;5;
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;29;32;55;30000;0;5;16;1;1;32;1;3;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;29;55;16;30000;0;15;29;1;12;55;0;1;0;0;0;1;0;1;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;29;55;29;30000;0;27;8;1;1;55;-2;4;0;0;0;1;0;2;0;0;0
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;29;55;32;30000;0;14;8;1;23;55;0;3;0;0;0;1;0;1;1;0;1
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;29;55;32;30000;0;14;16;1;24;55;0;3;0;0;0;1;0;1;1;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;29;55;55;30000;0;3;16;1;1;55;0;2;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;30;30;30;30000;0;8;30;1;1;1;-2;1;0;0;1;1;1;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;30;30;76;30000;0;8;30;1;1;30;-1;1;0;0;0;1;0;0;0;0;0
@@ -1087,7 +1134,7 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;9;30000;0;10;8;1;18;32;-2;1;0;
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;13;30000;0;8;32;1;26;1;-2;-2;0;1;0;1;0;0;2;1;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;17;30000;0;8;32;1;1;1;-1;1;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;24;30000;0;8;8;1;1;32;0;-1;0;0;0;1;0;2;1;0;0
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;25;30000;0;8;8;1;1;32;-1;-1;0;0;0;1;0;0;1;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;25;30000;0;8;8;1;1;32;-1;0;0;0;0;1;0;0;1;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;26;30000;0;8;8;1;1;32;-2;0;0;0;0;1;0;0;1;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;28;30000;0;8;8;1;1;32;-2;4;0;0;0;1;0;2;1;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;5;32;30000;0;8;32;1;1;1;-2;-2;0;0;0;1;0;0;0;0;0
@@ -1137,6 +1184,11 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;17;5;30000;0;29;8;1;31;1;1;1;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;17;13;30000;0;27;8;1;23;1;0;-1;0;0;0;1;0;1;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;17;17;30000;0;15;32;1;1;1;-1;1;1;0;1;1;1;2;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;17;32;30000;0;7;8;1;1;1;-2;4;0;0;0;1;0;2;1;0;1
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;20;5;30000;0;40;8;1;29;32;-2;4;0;0;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;20;6;30000;0;31;32;1;4;32;-2;4;0;0;0;1;0;0;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;20;13;30000;0;27;32;1;2;32;-1;4;0;0;0;1;0;1;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;20;20;30000;0;12;32;1;1;32;1;4;0;0;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;20;32;30000;0;3;32;1;22;32;-2;4;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;22;9;30000;0;27;8;1;8;32;1;-1;0;0;0;1;0;1;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;22;22;30000;0;12;8;1;1;1;1;4;1;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;22;32;30000;0;9;8;1;1;1;-1;-1;0;0;0;1;0;1;1;0;1
@@ -1184,11 +1236,13 @@ Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;29;32;30000;0;59;8;1;1;32;0;-1;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;29;55;30000;0;59;8;1;1;32;-1;4;0;1;1;1;0;1;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;4;30000;0;41;8;1;1;1;-2;1;0;0;0;1;0;1;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;5;30000;0;30;8;1;13;1;1;1;0;0;0;1;0;1;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;6;30000;0;41;32;1;1;32;-2;4;0;0;0;1;0;1;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;7;30000;0;30;8;1;14;32;1;-2;0;0;0;1;0;1;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;9;30000;0;30;8;1;8;1;1;1;0;0;0;1;0;1;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;13;30000;0;30;8;1;17;1;1;1;0;0;0;1;0;1;2;0;0
-Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;14;30000;0;30;8;1;13;32;0;0;0;0;0;1;0;1;2;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;14;30000;0;30;8;1;22;32;0;-1;0;0;0;1;0;1;2;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;17;30000;0;11;32;1;1;1;-1;1;0;0;0;1;0;0;0;0;0
+Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;20;30000;0;30;32;1;1;32;-1;4;0;0;0;1;0;2;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;22;30000;0;8;32;1;1;1;1;1;0;0;0;1;0;2;0;0;1
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;24;30000;0;8;32;1;1;1;-2;1;0;0;0;1;0;0;0;0;0
 Intel(R) Data Center GPU Max 1550 [0x0bd5];3;32;32;25;30000;0;30;8;1;1;32;1;1;0;0;0;1;0;2;0;0;0
diff --git a/src/acc/opencl/smm/tune_multiply.py b/src/acc/opencl/smm/tune_multiply.py
index 40d68c82f6f..7080b8ba29f 100755
--- a/src/acc/opencl/smm/tune_multiply.py
+++ b/src/acc/opencl/smm/tune_multiply.py
@@ -14,8 +14,7 @@
 from opentuner import MeasurementInterface
 from opentuner import Result
 from signal import signal, SIGINT
-import tempfile
-import shutil
+import tempfile  # , shutil
 import copy
 import json
 import glob
@@ -123,6 +122,10 @@ def __init__(self, args):
             device = re.search(devicepat, str(self.run_result["stderr"]))
             self.ndevices = int(device.group(1)) if device and device.group(1) else 0
             self.device = device.group(2) if device and device.group(2) else ""
+            # idevice: make certain resources/names unique on a per-rank basis
+            envrank = os.getenv("PMI_RANK", os.getenv("OMPI_COMM_WORLD_LOCAL_RANK"))
+            if envrank:
+                self.idevice = int(envrank) % self.ndevices
         elif self.args.update is not None and "" != self.args.update:
             self.device = self.args.update
         if self.run_result and 0 == self.run_result["returncode"]:
@@ -198,14 +201,15 @@ def __init__(self, args):
             and (self.size and 0 < self.size)
         ):  # setup database (DB)
             if self.args.database is None:  # adjust DB-location
-                envrank = os.getenv("PMI_RANK", os.getenv("OMPI_COMM_WORLD_LOCAL_RANK"))
                 tmpdir = os.path.join(tempfile.gettempdir(), "opentuner")
-                if envrank:
-                    self.idevice = int(envrank) % self.ndevices
+                if self.idevice is not None:
                     tmpdir += str(self.idevice)
-                if os.path.isdir(tmpdir):
-                    shutil.rmtree(tmpdir)
-                os.mkdir(tmpdir)
+                # if os.path.isdir(tmpdir):
+                # shutil.rmtree(tmpdir)
+                try:
+                    os.mkdir(tmpdir)
+                except:  # noqa: E722
+                    pass
                 self.args.database = "sqlite:///" + os.path.join(
                     tmpdir, "{}.db".format(os.getpid())
                 )
@@ -267,7 +271,7 @@ def launch(self, envs, check, nrep=None, verbose=None):
         if verbose is not None and 0 != int(verbose):
             msg = env_exe.replace("OPENCL_LIBSMM_SMM_", "")
             print("{}: {}".format("x".join(map(str, mnk)), msg))
-        env_std = "OMP_PROC_BIND=TRUE OPENCL_LIBSMM_SMM_S=0 NEO_CACHE_PERSISTENT=0"
+        env_std = "OMP_PROC_BIND=TRUE OPENCL_LIBSMM_SMM_S=0 NEO_CACHE_PERSISTENT=0 CUDA_CACHE_DISABLE=1"
         env_check = "CHECK={}".format(check if check is not None else 1)
         env_intrn = "{} {}".format(  # consider device-id
             "" if self.idevice is None else "ACC_OPENCL_DEVICE={}".format(self.idevice),
@@ -587,18 +591,15 @@ def save_final_config(self, configuration, final=True):
                 except:  # noqa: E722
                     pass
                 gflops = data["GFLOPS"] if data and "GFLOPS" in data else 0
-                filename = os.path.join(
-                    self.args.jsondir,
-                    (
-                        "{}-{}gflops.json".format(self.args.label, round(gflops))
-                        if 0 < gflops
-                        else "{}.json".format(self.args.label)
-                    ),
-                )
-                try:
-                    os.rename(filedot, filename)
-                except:  # noqa: E722
-                    pass
+                if 0 < gflops:
+                    filename = os.path.join(
+                        self.args.jsondir,
+                        "{}-{}gflops.json".format(self.args.label, round(gflops)),
+                    )
+                    try:
+                        os.rename(filedot, filename)
+                    except:  # noqa: E722
+                        pass
             # self.manipulator().save_to_file(config, filename)
             with open(filedot, "w") as file:
                 cfg = config
@@ -614,7 +615,7 @@ def save_final_config(self, configuration, final=True):
             mnk = "x".join(map(str, self.mnk))
             print("FAILED[{}] {}: {}".format(result, mnk, failed), flush=True)
             return
-        if final and os.path.exists(filedot):
+        if final and 0 < self.gflops and os.path.exists(filedot):
             filepattern = "{}-*.json".format(default_basename)
             fileglobs = glob.glob(
                 os.path.normpath(os.path.join(self.args.jsondir, filepattern))
@@ -905,8 +906,6 @@ def handle_sigint(self, signum, frame):
     # OPENCL_LIBSMM_SMM_xx=tune|enabled|on must be given to permit tuning)
     if os.getenv("OPENCL_LIBSMM_SMM_WS") not in default_enable_tune:
         os.environ["OPENCL_LIBSMM_SMM_WS"] = "{}".format(args.ws)
-    if os.getenv("OPENCL_LIBSMM_SMM_AL") not in default_enable_tune:
-        os.environ["OPENCL_LIBSMM_SMM_AL"] = "{}".format(args.al)
     # fix tunables according to level of tuning
     if 1 <= args.tlevel or 0 > args.tlevel:
         os.environ["OPENCL_LIBSMM_SMM_BM"] = "{}".format(args.bm)
@@ -932,7 +931,7 @@ def handle_sigint(self, signum, frame):
                     line = file.readline()
                     if not line:
                         break
-                    args.mnk = line.strip()
+                    args.mnk, args.label = line.strip(), ""
                     if args.mnk:
                         start(args)
                         print("")
@@ -944,6 +943,4 @@ def handle_sigint(self, signum, frame):
                     args.merge = -1
             start(args)
     else:
-        if not args.mnk:  # parse and sanitize kernel shape
-            args.mnk = default_mnk
         start(args)
diff --git a/src/base/dbcsr_machine.F b/src/base/dbcsr_machine.F
index 3428cf2ee19..8cebf095494 100644
--- a/src/base/dbcsr_machine.F
+++ b/src/base/dbcsr_machine.F
@@ -17,7 +17,7 @@ MODULE dbcsr_machine
       m_abort, m_chdir, m_flush_internal => m_flush, m_getarg, m_getcwd, m_getlog, m_getpid, &
       m_hostnm, m_iargc, m_memory, m_memory_details, m_memory_max, m_mov, m_procrun
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads, OMP_GET_WTIME
+!$ USE OMP_LIB, ONLY: omp_get_wtime
 
    IMPLICIT NONE
 
@@ -100,7 +100,7 @@ FUNCTION m_walltime() RESULT(wt)
          wt = (REAL(count, KIND=dp) + REAL(cycles, KIND=dp)*(1.0_dp + REAL(count_max, KIND=dp))) &
               /REAL(count_rate, KIND=dp)
 !$    ELSE
-!$       wt = OMP_GET_WTIME()
+!$       wt = omp_get_wtime()
 !$    END IF
 #endif
    END FUNCTION m_walltime
diff --git a/src/base/dbcsr_machine_posix.f90 b/src/base/dbcsr_machine_posix.f90
index cf40fc13d88..0d66707e7a8 100644
--- a/src/base/dbcsr_machine_posix.f90
+++ b/src/base/dbcsr_machine_posix.f90
@@ -17,7 +17,7 @@
   PRIVATE
 
   PUBLIC :: m_flush, m_memory, &
-            m_hostnm, m_getcwd, m_getlog, m_getuid, m_getpid, m_getarg, &
+            m_hostnm, m_getcwd, m_getlog, m_getpid, m_getarg, &
             m_iargc, m_abort, m_chdir, m_mov, &
             m_memory_details, m_procrun
 
@@ -325,20 +325,6 @@ SUBROUTINE m_getlog(user)
 
   END SUBROUTINE m_getlog
 
-! *****************************************************************************
-  SUBROUTINE m_getuid(uid)
-     INTEGER, INTENT(OUT)                     :: uid
-
-     INTERFACE
-        FUNCTION getuid() BIND(C, name="getuid") RESULT(uid)
-           IMPORT
-           INTEGER(KIND=C_INT)              :: uid
-        END FUNCTION
-     END INTERFACE
-
-     uid = getuid()
-  END SUBROUTINE m_getuid
-
 ! *****************************************************************************
   SUBROUTINE m_getpid(pid)
      INTEGER, INTENT(OUT)                     :: pid
diff --git a/src/block/dbcsr_block_access.F b/src/block/dbcsr_block_access.F
index 71cd2076b07..6fd7319deea 100644
--- a/src/block/dbcsr_block_access.F
+++ b/src/block/dbcsr_block_access.F
@@ -62,7 +62,7 @@ MODULE dbcsr_block_access
                           real_8
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
+!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads
 
    IMPLICIT NONE
 
diff --git a/src/block/dbcsr_block_operations.F b/src/block/dbcsr_block_operations.F
index fc7f8b51e4d..370327c20d5 100644
--- a/src/block/dbcsr_block_operations.F
+++ b/src/block/dbcsr_block_operations.F
@@ -30,7 +30,6 @@ MODULE dbcsr_block_operations
                           sp
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
    IMPLICIT NONE
 #if defined(__LIBXSMM) && TO_VERSION(1, 10) < TO_VERSION(LIBXSMM_CONFIG_VERSION_MAJOR, LIBXSMM_CONFIG_VERSION_MINOR)
 #  define __LIBXSMM_BLOCKOPS
diff --git a/src/block/dbcsr_iterator_operations.F b/src/block/dbcsr_iterator_operations.F
index 64afa010657..49626f1a52d 100644
--- a/src/block/dbcsr_iterator_operations.F
+++ b/src/block/dbcsr_iterator_operations.F
@@ -31,7 +31,7 @@ MODULE dbcsr_iterator_operations
                           real_8
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads, omp_in_parallel
+!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads, omp_in_parallel
 
    IMPLICIT NONE
 
diff --git a/src/core/dbcsr_config.F b/src/core/dbcsr_config.F
index 0655a1cc81a..c6d06708390 100644
--- a/src/core/dbcsr_config.F
+++ b/src/core/dbcsr_config.F
@@ -177,9 +177,7 @@ MODULE dbcsr_config
       SET_PARAMETER_DEFAULT(USE_MEMPOOLS_CPU, CONF_PAR_LOGICAL, .FALSE.)
       SET_PARAMETER_DEFAULT(USE_MPI_ALLOCATOR, CONF_PAR_LOGICAL, .FALSE.)
       SET_PARAMETER_DEFAULT(TAS_SPLIT_FACTOR, CONF_PAR_REAL, 1.0_real_8)
-#if defined(__DBCSR_ACC_G2G)
-      SET_PARAMETER_DEFAULT(USE_ACC_G2G, CONF_PAR_LOGICAL, .TRUE.)
-#endif
+      SET_PARAMETER_DEFAULT(USE_ACC_G2G, CONF_PAR_LOGICAL, .FALSE.)
    END TYPE dbcsr_config_type
 
    TYPE(dbcsr_config_type), PROTECTED, SAVE :: dbcsr_cfg = dbcsr_config_type() ! defaults
@@ -414,11 +412,7 @@ SUBROUTINE dbcsr_set_config( &
       CALL dbcsr_cfg%accdrv_binning_binsize%set(accdrv_binning_binsize)
       CALL dbcsr_cfg%use_mempools_cpu%set(use_mempools_cpu)
       CALL dbcsr_cfg%tas_split_factor%set(tas_split_factor)
-#if defined(__DBCSR_ACC_G2G)
       CALL dbcsr_cfg%use_acc_g2g%set(use_acc_g2g)
-#else
-      MARK_USED(use_acc_g2g)
-#endif
 
       IF (0 == nthreads) THEN
          nthreads = 1
@@ -517,11 +511,7 @@ SUBROUTINE dbcsr_get_default_config( &
       IF (PRESENT(use_mempools_cpu)) use_mempools_cpu = dbcsr_cfg%use_mempools_cpu%defval
       IF (PRESENT(nstacks)) nstacks = dbcsr_cfg%n_stacks%defval
       IF (PRESENT(tas_split_factor)) tas_split_factor = dbcsr_cfg%tas_split_factor%defval
-#if defined(__DBCSR_ACC_G2G)
       IF (PRESENT(use_acc_g2g)) use_acc_g2g = dbcsr_cfg%use_acc_g2g%defval
-#else
-      MARK_USED(use_acc_g2g)
-#endif
 
    END SUBROUTINE dbcsr_get_default_config
 
@@ -650,11 +640,9 @@ SUBROUTINE dbcsr_print_config(unit_nr)
          WRITE (UNIT=unit_nr, FMT='(1X,A,T70,I11,A4)') &
             "DBCSR| ACC: Min. flop for processing", dbcsr_cfg%accdrv_min_flop_process%val, &
             dbcsr_cfg%accdrv_min_flop_process%print_source()
-#if defined(__DBCSR_ACC_G2G)
          WRITE (UNIT=unit_nr, FMT='(1X,A,T80,L1,A4)') &
             "DBCSR| ACC: Use G2G algorithm", dbcsr_cfg%use_acc_g2g%val, &
             dbcsr_cfg%use_acc_g2g%print_source()
-#endif
          IF (dbcsr_cfg%accdrv_stack_sort%val) THEN
             WRITE (UNIT=unit_nr, FMT='(1X,A,T70,I11,A4)') &
                "DBCSR| ACC: Min. flop for sorting", dbcsr_cfg%accdrv_min_flop_sort%val, &
diff --git a/src/core/dbcsr_lib.F b/src/core/dbcsr_lib.F
index 401abe931da..3f0a44a7c41 100644
--- a/src/core/dbcsr_lib.F
+++ b/src/core/dbcsr_lib.F
@@ -56,8 +56,6 @@ MODULE dbcsr_lib
 
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads
-
 #if defined (__DBCSR_ACC)
    USE ISO_C_BINDING, ONLY: C_INT
 #endif
diff --git a/src/core/dbcsr_methods.F b/src/core/dbcsr_methods.F
index 862076e3fb2..70e7b9c064e 100644
--- a/src/core/dbcsr_methods.F
+++ b/src/core/dbcsr_methods.F
@@ -27,7 +27,6 @@ MODULE dbcsr_methods
       dbcsr_type_real_8, dbcsr_type_symmetric, dbcsr_work_type
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
    IMPLICIT NONE
 
    PRIVATE
diff --git a/src/core/dbcsr_types.F b/src/core/dbcsr_types.F
index 0c175889be8..fb232079d51 100644
--- a/src/core/dbcsr_types.F
+++ b/src/core/dbcsr_types.F
@@ -24,8 +24,6 @@ MODULE dbcsr_types
                           int_8
    USE dbcsr_mpiwrap, ONLY: mp_comm_type, mp_comm_null
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
-
    IMPLICIT NONE
    PRIVATE
 
diff --git a/src/data/dbcsr_data_methods_low.F b/src/data/dbcsr_data_methods_low.F
index 8e89bb267a8..041a6ef3edb 100644
--- a/src/data/dbcsr_data_methods_low.F
+++ b/src/data/dbcsr_data_methods_low.F
@@ -30,8 +30,6 @@ MODULE dbcsr_data_methods_low
                           real_8
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
-
    IMPLICIT NONE
 
    PRIVATE
diff --git a/src/data/dbcsr_data_operations.F b/src/data/dbcsr_data_operations.F
index 66e8f9c1bea..e5e9e1ba079 100644
--- a/src/data/dbcsr_data_operations.F
+++ b/src/data/dbcsr_data_operations.F
@@ -30,8 +30,6 @@ MODULE dbcsr_data_operations
                           dbcsr_type_real_8
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
-
    IMPLICIT NONE
 
    PRIVATE
diff --git a/src/data/dbcsr_data_types.F b/src/data/dbcsr_data_types.F
index cef186a0512..da18b6c4407 100644
--- a/src/data/dbcsr_data_types.F
+++ b/src/data/dbcsr_data_types.F
@@ -15,7 +15,7 @@ MODULE dbcsr_data_types
    USE dbcsr_kinds, ONLY: &
       dp, int_4, int_4_size, int_8, int_8_size, real_4, real_4_size, real_8, real_8_size
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads, omp_lock_kind
+!$ USE OMP_LIB, ONLY: omp_lock_kind
 
 #include "base/dbcsr_base_uses.f90"
 
diff --git a/src/data/dbcsr_mem_methods.F b/src/data/dbcsr_mem_methods.F
index 66abf41f35a..a28a8fbef79 100644
--- a/src/data/dbcsr_mem_methods.F
+++ b/src/data/dbcsr_mem_methods.F
@@ -22,8 +22,7 @@ MODULE dbcsr_mem_methods
                                dbcsr_memtype_type
    USE dbcsr_kinds, ONLY: dp
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads, &
-!$                    omp_set_lock, omp_unset_lock, omp_init_lock, omp_lock_kind, omp_destroy_lock
+!$ USE OMP_LIB, ONLY: omp_set_lock, omp_unset_lock, omp_init_lock, omp_destroy_lock
 
 #include "base/dbcsr_base_uses.f90"
 
diff --git a/src/data/dbcsr_ptr_util.F b/src/data/dbcsr_ptr_util.F
index eb9d25da071..030eaa16cf5 100644
--- a/src/data/dbcsr_ptr_util.F
+++ b/src/data/dbcsr_ptr_util.F
@@ -28,8 +28,6 @@ MODULE dbcsr_ptr_util
                             mp_deallocate
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
-
    IMPLICIT NONE
 
    PRIVATE
@@ -294,15 +292,9 @@ SUBROUTINE mem_copy_${nametype1}$ (dst, src, n)
         !! length of copy
          ${type1}$, DIMENSION(1:n), INTENT(OUT) :: dst
         !! destination memory
-         ${type1}$, DIMENSION(1:n), INTENT(IN) :: src
+         ${type1}$, DIMENSION(1:n), INTENT(IN)  :: src
         !! source memory
-#if !defined(__DBCSR_DISABLE_WORKSHARE)
-!$OMP     PARALLEL WORKSHARE DEFAULT(none) SHARED(dst,src)
-#endif
          dst(:) = src(:)
-#if !defined(__DBCSR_DISABLE_WORKSHARE)
-!$OMP     END PARALLEL WORKSHARE
-#endif
       END SUBROUTINE mem_copy_${nametype1}$
 
       SUBROUTINE mem_zero_${nametype1}$ (dst, n)
@@ -312,13 +304,7 @@ SUBROUTINE mem_zero_${nametype1}$ (dst, n)
         !! length of elements to zero
          ${type1}$, DIMENSION(1:n), INTENT(OUT) :: dst
         !! destination memory
-#if !defined(__DBCSR_DISABLE_WORKSHARE)
-!$OMP     PARALLEL WORKSHARE DEFAULT(none) SHARED(dst)
-#endif
          dst(:) = ${zero1}$
-#if !defined(__DBCSR_DISABLE_WORKSHARE)
-!$OMP     END PARALLEL WORKSHARE
-#endif
       END SUBROUTINE mem_zero_${nametype1}$
 
       SUBROUTINE mem_alloc_${nametype1}$ (mem, n, mem_type)
diff --git a/src/dbcsr_api.F b/src/dbcsr_api.F
index d28060357ea..a0a3faa4d10 100644
--- a/src/dbcsr_api.F
+++ b/src/dbcsr_api.F
@@ -140,7 +140,6 @@ MODULE dbcsr_api
                           real_4, &
                           real_8
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
 #include "base/dbcsr_base_uses.f90"
 
    IMPLICIT NONE
diff --git a/src/dist/dbcsr_dist_methods.F b/src/dist/dbcsr_dist_methods.F
index 8a44ac20f76..68b2ac0a38c 100644
--- a/src/dist/dbcsr_dist_methods.F
+++ b/src/dist/dbcsr_dist_methods.F
@@ -39,7 +39,7 @@ MODULE dbcsr_dist_methods
                           dbcsr_mp_obj
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads, OMP_IN_PARALLEL
+!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_num_threads, omp_in_parallel
    IMPLICIT NONE
 
    PRIVATE
@@ -467,7 +467,7 @@ SUBROUTINE dbcsr_distribution_make_threads(dist, row_sizes)
 !   ---------------------------------------------------------------------------
 
       dist_p => dist
-!$    IF (.NOT. OMP_IN_PARALLEL()) THEN
+!$    IF (.NOT. omp_in_parallel()) THEN
 ! GCC 10.2 refused to build with DEFAULT(NONE) SHARED(dist_p, row_sizes) here:
 !$OMP        PARALLEL DEFAULT(SHARED)
 !$       CALL make_threads(dist_p, row_sizes=row_sizes)
diff --git a/src/dist/dbcsr_dist_operations.F b/src/dist/dbcsr_dist_operations.F
index 1a6972b5075..2ebf3fb65c1 100644
--- a/src/dist/dbcsr_dist_operations.F
+++ b/src/dist/dbcsr_dist_operations.F
@@ -36,8 +36,6 @@ MODULE dbcsr_dist_operations
                           dbcsr_type
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
-
    IMPLICIT NONE
 
    PRIVATE
diff --git a/src/dist/dbcsr_dist_util.F b/src/dist/dbcsr_dist_util.F
index 8070f2fd5a8..ea988a1fe99 100644
--- a/src/dist/dbcsr_dist_util.F
+++ b/src/dist/dbcsr_dist_util.F
@@ -47,7 +47,6 @@ MODULE dbcsr_dist_util
       dbcsr_type_complex_8, dbcsr_type_real_4, dbcsr_type_real_8
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
    IMPLICIT NONE
    PRIVATE
 
diff --git a/src/mm/dbcsr_acc_operations.F b/src/mm/dbcsr_acc_operations.F
index d9b9d10040f..776f7ada5f1 100644
--- a/src/mm/dbcsr_acc_operations.F
+++ b/src/mm/dbcsr_acc_operations.F
@@ -23,8 +23,6 @@ MODULE dbcsr_acc_operations
    USE dbcsr_kinds, ONLY: real_8, dp
    USE dbcsr_types, ONLY: dbcsr_type_real_8
 
-!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads
-
 #include "base/dbcsr_base_uses.f90"
 
    IMPLICIT NONE
diff --git a/src/mm/dbcsr_mm.F b/src/mm/dbcsr_mm.F
index b3d475310e6..8dd9da1eaee 100644
--- a/src/mm/dbcsr_mm.F
+++ b/src/mm/dbcsr_mm.F
@@ -906,7 +906,6 @@ SUBROUTINE dbcsr_multiply_generic(transa, transb, &
                           flop=my_flop, keep_product_data=keep_product_data)
       ELSE
          data_type = dbcsr_get_data_type(product_matrix)
-#if defined (__DBCSR_ACC_G2G)
          IF (data_type .NE. dbcsr_type_real_8 .OR. (.NOT. dbcsr_cfg%use_acc_g2g%val)) THEN
             ! If G2G is enabled, norms have to be calculated on the GPU.
             ! Since the norms kernel expects only real_8 type data, we
@@ -921,12 +920,6 @@ SUBROUTINE dbcsr_multiply_generic(transa, transb, &
                                      filter_eps=filter_eps, &
                                      flop=my_flop, keep_product_data=keep_product_data)
          END IF
-#else
-         CALL multiply_cannon(m2s_left, m2s_right, product_matrix, &
-                              retain_sparsity=retain_sparsity, &
-                              filter_eps=filter_eps, &
-                              flop=my_flop, keep_product_data=keep_product_data)
-#endif
          CALL dbcsr_finalize(product_matrix, reshuffle=PRESENT(filter_eps) .AND. .NOT. keep_sparsity)
       END IF
       !
diff --git a/src/mm/dbcsr_mm_3d.F b/src/mm/dbcsr_mm_3d.F
index b25b360abd7..60f46754518 100644
--- a/src/mm/dbcsr_mm_3d.F
+++ b/src/mm/dbcsr_mm_3d.F
@@ -107,8 +107,9 @@ MODULE dbcsr_mm_3d
                                     dbcsr_work_destroy
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads, &
-!$                    omp_set_lock, omp_unset_lock, omp_init_lock, omp_lock_kind, omp_destroy_lock
+!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads, &
+!$                    omp_set_lock, omp_unset_lock, omp_init_lock, &
+!$                    omp_lock_kind, omp_destroy_lock
 
    IMPLICIT NONE
 
diff --git a/src/mm/dbcsr_mm_accdrv.F b/src/mm/dbcsr_mm_accdrv.F
index 5a4cc28f05b..8c69ad07dd0 100644
--- a/src/mm/dbcsr_mm_accdrv.F
+++ b/src/mm/dbcsr_mm_accdrv.F
@@ -531,6 +531,9 @@ SUBROUTINE dbcsr_mm_accdrv_process(this, left, right, params, stack_size, &
       IF (success) THEN
          CALL acc_event_record(stackbuf%calculated, stream=stackbuf%stream)
       ELSE
+         IF (dbcsr_cfg%use_acc_g2g%val) THEN
+            DBCSR_ABORT("MPI G2G requires all kernels to be evaluated on the GPU!")
+         END IF
          this%do_gpu_c_redux = .TRUE.
       END IF
 
diff --git a/src/mm/dbcsr_mm_cannon.F b/src/mm/dbcsr_mm_cannon.F
index 53b1f7faf7c..71357f53ded 100644
--- a/src/mm/dbcsr_mm_cannon.F
+++ b/src/mm/dbcsr_mm_cannon.F
@@ -123,7 +123,7 @@ MODULE dbcsr_mm_cannon
 
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
+!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads
 
    IMPLICIT NONE
 
diff --git a/src/mm/dbcsr_mm_csr.F b/src/mm/dbcsr_mm_csr.F
index 0cf05b0a7e3..f84692b747f 100644
--- a/src/mm/dbcsr_mm_csr.F
+++ b/src/mm/dbcsr_mm_csr.F
@@ -45,7 +45,7 @@ MODULE dbcsr_mm_csr
                           dbcsr_work_type
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
+!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads
 
    IMPLICIT NONE
 
diff --git a/src/mm/dbcsr_mm_dist_operations.F b/src/mm/dbcsr_mm_dist_operations.F
index 04159a34a83..7095bd18cd1 100644
--- a/src/mm/dbcsr_mm_dist_operations.F
+++ b/src/mm/dbcsr_mm_dist_operations.F
@@ -35,8 +35,6 @@ MODULE dbcsr_mm_dist_operations
       dbcsr_slot_nblkcols_local, dbcsr_slot_nblkrows_local, dbcsr_type
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
-
    IMPLICIT NONE
 
    PRIVATE
diff --git a/src/mm/dbcsr_mm_hostdrv.F b/src/mm/dbcsr_mm_hostdrv.F
index fa33c90021a..99eee4e3641 100644
--- a/src/mm/dbcsr_mm_hostdrv.F
+++ b/src/mm/dbcsr_mm_hostdrv.F
@@ -38,8 +38,6 @@ MODULE dbcsr_mm_hostdrv
                           sp
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
-
    IMPLICIT NONE
 
    PRIVATE
diff --git a/src/mm/dbcsr_mm_multrec.F b/src/mm/dbcsr_mm_multrec.F
index 161b0b69f6a..2d3152dab1a 100644
--- a/src/mm/dbcsr_mm_multrec.F
+++ b/src/mm/dbcsr_mm_multrec.F
@@ -43,7 +43,7 @@ MODULE dbcsr_mm_multrec
                           sp
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
+!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads
 
    IMPLICIT NONE
 
diff --git a/src/mm/dbcsr_mm_sched.F b/src/mm/dbcsr_mm_sched.F
index db68526e398..19fd41de289 100644
--- a/src/mm/dbcsr_mm_sched.F
+++ b/src/mm/dbcsr_mm_sched.F
@@ -49,7 +49,7 @@ MODULE dbcsr_mm_sched
                           dbcsr_work_type
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
+!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads
 
    IMPLICIT NONE
 
diff --git a/src/mm/dbcsr_multiply_api.F b/src/mm/dbcsr_multiply_api.F
index 44c0806bb99..ff0b95e2f5c 100644
--- a/src/mm/dbcsr_multiply_api.F
+++ b/src/mm/dbcsr_multiply_api.F
@@ -18,8 +18,6 @@ MODULE dbcsr_multiply_api
                           dbcsr_type_real_4, &
                           dbcsr_type_real_8
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
-
 #include "base/dbcsr_base_uses.f90"
 
    IMPLICIT NONE
diff --git a/src/mpi/dbcsr_mp_methods.F b/src/mpi/dbcsr_mp_methods.F
index 413e9afbf4c..7ef89d040fc 100644
--- a/src/mpi/dbcsr_mp_methods.F
+++ b/src/mpi/dbcsr_mp_methods.F
@@ -19,8 +19,6 @@ MODULE dbcsr_mp_methods
                             mp_comm_null, mp_comm_type
    USE dbcsr_types, ONLY: dbcsr_mp_obj
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
-
 #include "base/dbcsr_base_uses.f90"
 
    IMPLICIT NONE
diff --git a/src/mpi/dbcsr_mpiwrap.F b/src/mpi/dbcsr_mpiwrap.F
index f5393630eb9..a82edf17251 100644
--- a/src/mpi/dbcsr_mpiwrap.F
+++ b/src/mpi/dbcsr_mpiwrap.F
@@ -5182,13 +5182,7 @@ SUBROUTINE mp_rget_${nametype1}$v(base, source, win, win_data, myproc, disp, req
             MARK_USED(myproc)
 #endif
             IF (do_local_copy) THEN
-#if !defined(__DBCSR_DISABLE_WORKSHARE)
-!$OMP           PARALLEL WORKSHARE DEFAULT(none) SHARED(base,win_data,disp_aint,len)
-#endif
                base(:) = win_data(disp_aint + 1:disp_aint + len)
-#if !defined(__DBCSR_DISABLE_WORKSHARE)
-!$OMP           END PARALLEL WORKSHARE
-#endif
                request = mp_request_null
                ierr = 0
             ELSE
diff --git a/src/ops/dbcsr_io.F b/src/ops/dbcsr_io.F
index c5920543788..c024bb930bd 100644
--- a/src/ops/dbcsr_io.F
+++ b/src/ops/dbcsr_io.F
@@ -48,8 +48,6 @@ MODULE dbcsr_io
    USE dbcsr_work_operations, ONLY: dbcsr_create
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
-
    IMPLICIT NONE
 
    PRIVATE
diff --git a/src/ops/dbcsr_operations.F b/src/ops/dbcsr_operations.F
index e0a59a92e48..44112b8cb23 100644
--- a/src/ops/dbcsr_operations.F
+++ b/src/ops/dbcsr_operations.F
@@ -94,7 +94,7 @@ MODULE dbcsr_operations
                             mp_sum
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
+!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads
 
    IMPLICIT NONE
 
@@ -316,7 +316,6 @@ SUBROUTINE dbcsr_zero(matrix_a)
 
       CALL timeset(routineN, handle)
       SELECT CASE (dbcsr_get_data_type(matrix_a))
-#if defined(__DBCSR_DISABLE_WORKSHARE)
       CASE (dbcsr_type_complex_4)
          matrix_a%data_area%d%c_sp = (0.0, 0.0)
       CASE (dbcsr_type_complex_8)
@@ -325,24 +324,6 @@ SUBROUTINE dbcsr_zero(matrix_a)
          matrix_a%data_area%d%r_sp = 0.0
       CASE (dbcsr_type_real_8)
          matrix_a%data_area%d%r_dp = 0.0_dp
-#else
-      CASE (dbcsr_type_complex_4)
-!$OMP       PARALLEL WORKSHARE DEFAULT(NONE), SHARED(matrix_a)
-         matrix_a%data_area%d%c_sp = (0.0, 0.0)
-!$OMP       END PARALLEL WORKSHARE
-      CASE (dbcsr_type_complex_8)
-!$OMP       PARALLEL WORKSHARE DEFAULT(NONE), SHARED(matrix_a)
-         matrix_a%data_area%d%c_dp = (0.0_dp, 0.0_dp)
-!$OMP       END PARALLEL WORKSHARE
-      CASE (dbcsr_type_real_4)
-!$OMP       PARALLEL WORKSHARE DEFAULT(NONE), SHARED(matrix_a)
-         matrix_a%data_area%d%r_sp = 0.0
-!$OMP       END PARALLEL WORKSHARE
-      CASE (dbcsr_type_real_8)
-!$OMP       PARALLEL WORKSHARE DEFAULT(NONE), SHARED(matrix_a)
-         matrix_a%data_area%d%r_dp = 0.0_dp
-!$OMP       END PARALLEL WORKSHARE
-#endif
       END SELECT
       CALL timestop(handle)
    END SUBROUTINE dbcsr_zero
diff --git a/src/ops/dbcsr_test_methods.F b/src/ops/dbcsr_test_methods.F
index bc081615913..ef7e5cd3528 100644
--- a/src/ops/dbcsr_test_methods.F
+++ b/src/ops/dbcsr_test_methods.F
@@ -60,8 +60,6 @@ MODULE dbcsr_test_methods
                                     dbcsr_work_create
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
-
    IMPLICIT NONE
 
    PRIVATE
diff --git a/src/ops/dbcsr_tests.F b/src/ops/dbcsr_tests.F
index dc498862459..c3f69ff8012 100644
--- a/src/ops/dbcsr_tests.F
+++ b/src/ops/dbcsr_tests.F
@@ -55,8 +55,6 @@ MODULE dbcsr_tests
    USE dbcsr_work_operations, ONLY: dbcsr_create
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
-
    IMPLICIT NONE
 
    PRIVATE
diff --git a/src/ops/dbcsr_transformations.F b/src/ops/dbcsr_transformations.F
index ff4e060cf00..e6fc2a17806 100644
--- a/src/ops/dbcsr_transformations.F
+++ b/src/ops/dbcsr_transformations.F
@@ -91,8 +91,6 @@ MODULE dbcsr_transformations
                                     dbcsr_work_create
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
-
    IMPLICIT NONE
    PRIVATE
 
diff --git a/src/tensors/dbcsr_array_list_methods.F b/src/tensors/dbcsr_array_list_methods.F
index a80277abf77..1077e172488 100644
--- a/src/tensors/dbcsr_array_list_methods.F
+++ b/src/tensors/dbcsr_array_list_methods.F
@@ -19,6 +19,13 @@ MODULE dbcsr_array_list_methods
    USE dbcsr_allocate_wrap, ONLY: allocate_any
 
 #include "base/dbcsr_base_uses.f90"
+#if TO_VERSION(1, 11) <= TO_VERSION(LIBXSMM_CONFIG_VERSION_MAJOR, LIBXSMM_CONFIG_VERSION_MINOR)
+   USE libxsmm, ONLY: libxsmm_diff
+#  define PURE_ARRAY_EQ
+#else
+#  define PURE_ARRAY_EQ PURE
+#endif
+
    IMPLICIT NONE
    PRIVATE
    CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_array_list_methods'
@@ -275,15 +282,22 @@ FUNCTION check_equal(list1, list2)
       check_equal = array_eq_i(list1%col_data, list2%col_data) .AND. array_eq_i(list1%ptr, list2%ptr)
    END FUNCTION
 
-   PURE FUNCTION array_eq_i(arr1, arr2)
+   PURE_ARRAY_EQ FUNCTION array_eq_i(arr1, arr2)
       !! check whether two arrays are equal
+      !! The procedure prefix is a preprocessor macro set at the top of this module: it is
+      !! empty when the libxsmm_diff branch is compiled and PURE for the fallback branch.
       INTEGER, INTENT(IN), DIMENSION(:) :: arr1
       INTEGER, INTENT(IN), DIMENSION(:) :: arr2
       LOGICAL                           :: array_eq_i
 
+      ! Version check passed (LIBXSMM >= 1.11 as encoded by the macro): equal means no difference reported.
 #if TO_VERSION(1, 11) <= TO_VERSION(LIBXSMM_CONFIG_VERSION_MAJOR, LIBXSMM_CONFIG_VERSION_MINOR)
       array_eq_i = .NOT. libxsmm_diff(arr1, arr2)
 #else
+      ! Fallback: arrays of different size are unequal; otherwise all elements must match.
       array_eq_i = .FALSE.
       IF (SIZE(arr1) .EQ. SIZE(arr2)) array_eq_i = ALL(arr1 == arr2)
-
+#endif
    END FUNCTION
 
 END MODULE dbcsr_array_list_methods
diff --git a/src/tensors/dbcsr_tensor_types.F b/src/tensors/dbcsr_tensor_types.F
index c8eb1953355..17147fb49fa 100644
--- a/src/tensors/dbcsr_tensor_types.F
+++ b/src/tensors/dbcsr_tensor_types.F
@@ -698,18 +698,6 @@ SUBROUTINE dbcsr_t_distribution_new_expert(dist, pgrid, map1_2d, map2_2d, ${varl
             ALLOCATE (dist%refcount)
             dist%refcount = 1
             CALL timestop(handle)
-
-         CONTAINS
-            PURE FUNCTION array_eq_i(arr1, arr2)
-               INTEGER, INTENT(IN), DIMENSION(:) :: arr1
-               INTEGER, INTENT(IN), DIMENSION(:) :: arr2
-               LOGICAL                           :: array_eq_i
-
-               array_eq_i = .FALSE.
-               IF (SIZE(arr1) .EQ. SIZE(arr2)) array_eq_i = ALL(arr1 == arr2)
-
-            END FUNCTION
-
          END SUBROUTINE
 
          SUBROUTINE dbcsr_t_distribution_destroy(dist)
diff --git a/src/utils/dbcsr_toollib.F b/src/utils/dbcsr_toollib.F
index 5e6da8cf939..e084f683819 100644
--- a/src/utils/dbcsr_toollib.F
+++ b/src/utils/dbcsr_toollib.F
@@ -19,8 +19,6 @@ MODULE dbcsr_toollib
                           real_8
 #include "base/dbcsr_base_uses.f90"
 
-!$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads
-
    IMPLICIT NONE
 
    PRIVATE
diff --git a/src/work/dbcsr_work_operations.F b/src/work/dbcsr_work_operations.F
index e55d0efae2e..faff0aee208 100644
--- a/src/work/dbcsr_work_operations.F
+++ b/src/work/dbcsr_work_operations.F
@@ -916,7 +916,7 @@ SUBROUTINE dbcsr_finalize(matrix, reshuffle)
          ! built/modified in a parallel environment
          nwms = SIZE(matrix%wms)
          spawn = .FALSE.
-!$       IF (.NOT. OMP_IN_PARALLEL()) THEN
+!$       IF (.NOT. omp_in_parallel()) THEN
 !$          IF (nwms .GT. 1) spawn = .TRUE.
 !$       END IF
          IF (spawn) THEN
diff --git a/tools/docker/Dockerfile.build-env-ubuntu b/tools/docker/Dockerfile.build-env-ubuntu
index 73c935dec18..0ace1bd728a 100644
--- a/tools/docker/Dockerfile.build-env-ubuntu
+++ b/tools/docker/Dockerfile.build-env-ubuntu
@@ -55,12 +55,12 @@ RUN set -ex ; \
         git-archive-all \
         ;
 
-ARG libxsmm_version=1.17
+ARG libxsmm_version=488aa88f2a9825e9f92a0cfc773c1aedf019f88a
 
 RUN set -ex ; \
-    curl -LsS https://github.com/hfp/libxsmm/archive/${libxsmm_version}.tar.gz | tar -xz -C /opt ; \
+    curl -LsS https://github.com/libxsmm/libxsmm/archive/${libxsmm_version}.tar.gz | tar -xz -C /opt ; \
     ln -s libxsmm-${libxsmm_version} /opt/libxsmm ; \
-    make -j -C /opt/libxsmm MALLOC=0
+    make -j -C /opt/libxsmm WRAP=0
 
 ENV PKG_CONFIG_PATH="/opt/libxsmm/lib:${PKG_CONFIG_PATH}"
 
diff --git a/tools/docker/Dockerfile.build-env-ubuntu-cuda b/tools/docker/Dockerfile.build-env-ubuntu-cuda
index bdcc7bc109d..5dadec16251 100644
--- a/tools/docker/Dockerfile.build-env-ubuntu-cuda
+++ b/tools/docker/Dockerfile.build-env-ubuntu-cuda
@@ -46,12 +46,12 @@ RUN set -ex ; \
         git-archive-all \
         ;
 
-ARG libxsmm_version=1.17
+ARG libxsmm_version=488aa88f2a9825e9f92a0cfc773c1aedf019f88a
 
 RUN set -ex ; \
-    curl -LsS https://github.com/hfp/libxsmm/archive/${libxsmm_version}.tar.gz | tar -xz -C /opt ; \
+    curl -LsS https://github.com/libxsmm/libxsmm/archive/${libxsmm_version}.tar.gz | tar -xz -C /opt ; \
     ln -s libxsmm-${libxsmm_version} /opt/libxsmm ; \
-    make -j -C /opt/libxsmm MALLOC=0
+    make -j -C /opt/libxsmm WRAP=0
 
 ENV PKG_CONFIG_PATH="/opt/libxsmm/lib:${PKG_CONFIG_PATH}"