Skip to content

Commit

Permalink
Merge branch 'release-2.8.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
alazzaro committed Dec 11, 2024
2 parents 0f7c1fb + 985f189 commit 2cdbf13
Show file tree
Hide file tree
Showing 93 changed files with 830 additions and 5,439 deletions.
2 changes: 1 addition & 1 deletion .ci/daint.cscs.ch/ocl.build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ if [ ! -d "${HOME}/libxsmm" ]; then
fi
cd "${HOME}/libxsmm"
git fetch
git checkout d009b33e8742a93c9e1549323587fb6197451294
git checkout 488aa88f2a9825e9f92a0cfc773c1aedf019f88a
make -j
cd ..

Expand Down
15 changes: 8 additions & 7 deletions .github/workflows/testing-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,16 +74,16 @@ jobs:
mv build/coverage.info build/coverage-Linux-${{ matrix.use_mpi }}-${{ matrix.use_openmp }}-${{ matrix.use_smm }}-cpu.info
- name: Upload coverage data
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: coverage-data
name: coverage-data-${{ matrix.use_mpi }}-${{ matrix.use_openmp }}-${{ matrix.use_smm }}-${{ matrix.mpi_suffix }}
path: build/coverage-*.info

- name: Upload coverage data (generated files)
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
if: matrix.use_mpi == 'MPI=ON' && matrix.use_openmp == 'OPENMP=ON' && matrix.use_smm == 'SMM=blas' && matrix.mpi_suffix == 'openmpi'
with:
name: coverage-data
name: coverage-data-${{ matrix.use_mpi }}-${{ matrix.use_openmp }}-${{ matrix.use_smm }}-${{ matrix.mpi_suffix }}-generated-files
path: |
build/src/dbcsr.h
build/src/tensors/dbcsr_tensor.h
Expand Down Expand Up @@ -200,9 +200,10 @@ jobs:
- uses: actions/checkout@v4

- name: Download coverage data
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4.1.7
with:
name: coverage-data
pattern: coverage-data-*
merge-multiple: true

- name: Combine coverage
run: |
Expand All @@ -213,7 +214,7 @@ jobs:
lcov --summary merged.info
- name: Upload merged HTML report
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: html-report
path: htmlcov
Expand Down
11 changes: 6 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ fail_fast: false
minimum_pre_commit_version: 3.2.0
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: 'v0.5.4'
rev: 'v0.8.2'
hooks:
- id: ruff
args: [ --fix, --exit-non-zero-on-fix ]
Expand All @@ -15,19 +15,19 @@ repos:
.cp2k/.*|
)$
- repo: https://github.com/psf/black
rev: 24.4.2
rev: 24.10.0
hooks:
- id: black
name: Reformat Python files with the black code formatter
files: '^.*(/PACKAGE)|(\.py)$'
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
rev: v5.0.0
hooks:
- id: check-ast
- id: check-yaml
- id: check-symlinks
- id: trailing-whitespace
- repo: https://github.com/pseewald/fprettify
- repo: https://github.com/fortran-lang/fprettify
rev: v0.3.7
hooks:
- id: fprettify
Expand Down Expand Up @@ -65,4 +65,5 @@ repos:
language: python
files: \.(c|cc|cxx|cpp|cl|frag|glsl|h|hpp|hxx|ih|ispc|ipp|java|js|m|mm|proto|textproto|vert)$
args: ['-i', '-fallback-style=none', '--style=file']
additional_dependencies: ['clang-format']
# pin the version because clang-format output is not stable from version to version
additional_dependencies: ['clang-format~=19.1.0']
8 changes: 0 additions & 8 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -120,13 +120,7 @@ set_property(CACHE WITH_GPU PROPERTY STRINGS ${SUPPORTED_CUDA_ARCHITECTURES}

option(WITH_CUDA_PROFILING "Enable profiling within CUDA" OFF)
option(WITH_HIP_PROFILING "Enable profiling within HIP" OFF)
option(WITH_G2G "Enable GPU aware MPI within CUDA/HIP backends" OFF)

if (WITH_G2G AND ((NOT USE_ACCEL) OR ((NOT USE_ACCEL MATCHES "cuda")
AND (NOT USE_ACCEL MATCHES "hip"))))
message(
FATAL_ERROR "GPU aware MPI can only be enabled for HIP/CUDA GPU backends")
endif ()
# =================================================================================================
# LANGUAGES AND TESTING
enable_language(Fortran)
Expand Down Expand Up @@ -274,7 +268,6 @@ if (USE_ACCEL MATCHES "cuda")
message(STATUS "Kernel parameters: " ${WITH_GPU_PARAMS})
message(STATUS "GPU architecture number: " ${ACC_ARCH_NUMBER})
message(STATUS "GPU profiling enabled: " ${WITH_CUDA_PROFILING})
message(STATUS "GPU aware MPI enabled: " ${WITH_G2G})
endif ()

if (USE_ACCEL MATCHES "hip")
Expand Down Expand Up @@ -319,7 +312,6 @@ if (USE_ACCEL MATCHES "hip")
message(STATUS "Kernel parameters: " ${WITH_GPU_PARAMS})
message(STATUS "GPU architecture number: " ${ACC_ARCH_NUMBER})
message(STATUS "GPU profiling enabled: " ${WITH_HIP_PROFILING})
message(STATUS "GPU aware MPI enabled: " ${WITH_G2G})

# =================================== BLAS on GPU backend
find_package(hipblas CONFIG REQUIRED HINTS ${ROCM_PATH})
Expand Down
4 changes: 2 additions & 2 deletions VERSION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
MAJOR = 2
MINOR = 7
MINOR = 8
PATCH = 0
# A specific DATE (YYYY-MM-DD) marks an official release; otherwise
# this is considered a development version.
DATE = 2024-07-29
DATE = 2024-12-11


3 changes: 0 additions & 3 deletions cmake/CompilerConfiguration.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,6 @@ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
if ((NOT (USE_MPI)) OR (NOT ("${MPI_Fortran_LIBRARY_VERSION_STRING}" MATCHES "Open MPI")))
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=leak")
endif ()
if (USE_ACCEL MATCHES "hip" AND hip_VERSION GREATER_EQUAL 6.0.0) # Remove deprecated function error with ROCm v6+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations")
endif ()
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -funroll-loops")
set(CMAKE_CXX_FLAGS_COVERAGE "-O0 -g --coverage")
Expand Down
1 change: 0 additions & 1 deletion docs/guide/2-user-guide/1-installation/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ make
-DUSE_ACCEL=<opencl|cuda|hip>
-DWITH_CUDA_PROFILING=<OFF|ON>
-DWITH_HIP_PROFILING=<OFF|ON>
-DWITH_G2G=<OFF|ON>
-DWITH_C_API=<ON|OFF>
-DWITH_EXAMPLES=<ON|OFF>
-DWITH_GPU=<P100|K20X|K40|K80|V100|Mi50|Mi100|Mi250>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,3 @@ Assumed square matrix with 20x20 matrix with 5x5 blocks and a 2x2 processor grid
| `__CUDA_PROFILING` | Turns on the NVIDIA Tools Extension (NVTX). Requires linking with `-lnvToolsExt` | Fortran, C, C++ |
| `__CUDA` | Enable CUDA acceleration | C, C++ |
| `__HIP` | Enable HIP acceleration | C, C++ |
| `__DBCSR_ACC_G2G` | Enable GPU Aware MPI in CUDA and HIP backends | Fortran, C, C++ |
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,3 @@ The batched matrix-matrix multiplication kernels are templated on:
The batched transpose kernels are templated on:

* the characteristic dimensions of the transpose: `m, n`

## Predictive parameters

The input features for the predictive models can be 'raw' parameters (left-most-column in the figure below), or hand-engineered features 'derived' from the raw features (matrix sizes, launch parameters and resource usage estimations).

![libsmm_acc_predictive_modeling_features](../../../../../media/images/libsmm_acc_predictive_modeling_features.png)

This file was deleted.

This file was deleted.

3 changes: 0 additions & 3 deletions docs/media/images/README.md

This file was deleted.

Binary file not shown.

This file was deleted.

12 changes: 0 additions & 12 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -284,18 +284,6 @@ if (USE_ACCEL)
$<$<BOOL:${WITH_HIP_PROFILING}>:roctx64>
$<$<BOOL:${WITH_HIP_PROFILING}>:roctracer64>
$<$<STREQUAL:${USE_ACCEL},opencl>:OpenCL::OpenCL>)

if (WITH_G2G)
target_compile_definitions(
dbcsr
PRIVATE __DBCSR_ACC_G2G
$<$<STREQUAL:${USE_ACCEL},cuda>:__CUDA>
$<$<STREQUAL:${USE_ACCEL},cuda>:ARCH_NUMBER=${ACC_ARCH_NUMBER}>
$<$<STREQUAL:${USE_ACCEL},hip>:__HIP>
$<$<STREQUAL:${USE_ACCEL},hip>:ARCH_NUMBER=${ACC_ARCH_NUMBER}>
$<$<BOOL:${WITH_CUDA_PROFILING}>:__CUDA_PROFILING>
$<$<BOOL:${WITH_HIP_PROFILING}>:__HIP_PROFILING>)
endif ()
endif ()

# =================================================================================================
Expand Down
29 changes: 17 additions & 12 deletions src/acc/acc_bench_smm.c
Original file line number Diff line number Diff line change
Expand Up @@ -222,21 +222,25 @@ int main(int argc, char* argv[]) {
#endif
CHECK(libsmm_acc_init(), &result, check); /* note: libsmm_acc_init() may imply acc_init() */
if (EXIT_SUCCESS == result) {
const char* const env_device = getenv("DEVICE");
const int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device));
int ndevices = 0;
result = c_dbcsr_acc_get_ndevices(&ndevices);
if (0 < ndevices && (0 == device || EXIT_SUCCESS == c_dbcsr_acc_set_active_device(device))) {
printf("Activated device%i (ndevices=%i)\n", device, ndevices);
}
else {
if (0 >= ndevices) {
fprintf(stderr, "ERROR: No ACC-device found!\n");
if (EXIT_SUCCESS == result && 0 < ndevices) {
const char* const env_device = getenv("DEVICE");
const char* const env_rank = (NULL != getenv("PMI_RANK") ? getenv("PMI_RANK") : getenv("OMPI_COMM_WORLD_LOCAL_RANK"));
const int rank = (NULL != env_rank ? atoi(env_rank) : -1);
int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device));
device = ((0 <= device && device < ndevices) ? (0 <= rank ? (rank % ndevices) : device) : -1);
result = c_dbcsr_acc_set_active_device(device);
if (EXIT_SUCCESS == result) {
printf("Activated device%i (ndevices=%i)\n", device, ndevices);
}
else {
fprintf(stderr, "ERROR: Failed to activate device %i of %i!\n", device, ndevices);
fprintf(stderr, "ERROR: Failed to activate device!\n");
}
result = EXIT_FAILURE;
}
else {
fprintf(stderr, "ERROR: No ACC-device found!\n");
if (EXIT_SUCCESS == result) result = EXIT_FAILURE;
}
if (EXIT_SUCCESS == result) {
rnd = (int*)malloc(sizeof(int) * NRAND);
Expand Down Expand Up @@ -280,7 +284,7 @@ int main(int argc, char* argv[]) {
#if defined(USE_LIBXSMM)
libxsmm_timer_tickint start;
int print_offset = 0;
char print_buffer[1024];
char print_buffer[1024] = "";
# if defined(__OPENCL)
const char* const env_smm_repeat = getenv("SMM_NREPEAT");
const int smm_nrepeat = (NULL == env_smm_repeat ? 1 : MAX(atoi(env_smm_repeat), 1));
Expand Down Expand Up @@ -497,7 +501,7 @@ int main(int argc, char* argv[]) {
if (maxdiff < epsilon && NULL != file) maxdiff = epsilon;
if (0 < epsilon) {
if (LIBXSMM_NOTNAN(diff.v_tst)) {
PRINTF(" (|%g-%g|=%g)\n", diff.v_ref, diff.v_tst, fabs(diff.v_ref - diff.v_tst));
PRINTF(" (|%g-%g|=%g)\n", diff.v_ref, diff.v_tst, diff.linf_abs);
}
else {
PRINTF(" (%g)\n", diff.v_tst);
Expand All @@ -508,6 +512,7 @@ int main(int argc, char* argv[]) {
}
if (0 < check && check < epsilon) result = EXIT_FAILURE;
}
else fprintf(stderr, "ERROR: failed to validate!\n");
}
# endif
}
Expand Down
50 changes: 23 additions & 27 deletions src/acc/acc_bench_trans.c
Original file line number Diff line number Diff line change
Expand Up @@ -106,52 +106,48 @@ int main(int argc, char* argv[]) {
#else
const int warmup = 0;
#endif
const char* const env_device = getenv("DEVICE");
const int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device));
int *stack_hst = NULL, *stack_dev = NULL;
ELEM_TYPE *mat_hst = NULL, *mat_dev = NULL;
int result = EXIT_SUCCESS, ndevices = 0, r, i, mm = m, nn = n;
int result = EXIT_SUCCESS, mm = m, nn = n, r, i;
void* stream = NULL;
#if defined(USE_LIBXSMM)
libxsmm_timer_tickint start;
double duration;
#endif
assert(m <= (mn / n) && 0 == (mn % n));
if (MAX_KERNEL_DIM < m || MAX_KERNEL_DIM < n) {
fprintf(stderr, "Matrix shape exceeds MAX_KERNEL_DIM!\n");
result = EXIT_FAILURE;
}
CHECK(c_dbcsr_acc_init(), &result);
/* note: libsmm_acc_init() may imply acc_init() */
CHECK(libsmm_acc_init(), &result);
if (EXIT_SUCCESS == result) {
int ndevices = 0;
result = c_dbcsr_acc_get_ndevices(&ndevices);
if (0 < ndevices && (0 == device || EXIT_SUCCESS == c_dbcsr_acc_set_active_device(device))) {
printf("Activated device%i (ndevices=%i)\n", device, ndevices);
}
else {
if (0 >= ndevices) {
fprintf(stderr, "No ACC-device found!\n");
if (EXIT_SUCCESS == result && 0 < ndevices) {
const char* const env_device = getenv("DEVICE");
const char* const env_rank = (NULL != getenv("PMI_RANK") ? getenv("PMI_RANK") : getenv("OMPI_COMM_WORLD_LOCAL_RANK"));
const int rank = (NULL != env_rank ? atoi(env_rank) : -1);
int device = ((NULL == env_device || '\0' == *env_device) ? 0 : atoi(env_device));
device = ((0 <= device && device < ndevices) ? (0 <= rank ? (rank % ndevices) : device) : -1);
result = c_dbcsr_acc_set_active_device(device);
if (EXIT_SUCCESS == result) {
printf("Activated device%i (ndevices=%i)\n", device, ndevices);
printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n);
printf("typename (id=%i): %s\n", DBCSR_TYPE(ELEM_TYPE), DBCSR_STRINGIFY(ELEM_TYPE));
}
else {
fprintf(stderr, "Failed to activate device %i of %i!\n", device, ndevices);
fprintf(stderr, "ERROR: Failed to activate device!\n");
}
#if !defined(__CUDA)
CHECK(libsmm_acc_finalize(), NULL);
#endif
CHECK(c_dbcsr_acc_finalize(), NULL);
return result;
}
else {
fprintf(stderr, "ERROR: No ACC-device found!\n");
if (EXIT_SUCCESS == result) result = EXIT_FAILURE;
}
}
else {
fprintf(stderr, "ACC initialization failed!\n");
#if !defined(__CUDA)
CHECK(libsmm_acc_finalize(), NULL);
#endif
CHECK(c_dbcsr_acc_finalize(), NULL);
return result;
}
printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n);
printf("typename (id=%i): %s\n", DBCSR_TYPE(ELEM_TYPE), DBCSR_STRINGIFY(ELEM_TYPE));
if (MAX_KERNEL_DIM < m || MAX_KERNEL_DIM < n) {
fprintf(stderr, "Matrix shape exceeds MAX_KERNEL_DIM!\n");
result = EXIT_FAILURE;
}
#if defined(PRIORITY)
CHECK(c_dbcsr_acc_stream_priority_range(&priomin, &priomax), &result);
Expand Down Expand Up @@ -259,7 +255,7 @@ int main(int argc, char* argv[]) {
CHECK(c_dbcsr_acc_finalize(), NULL);
if (EXIT_SUCCESS != result) {
if (-1 != result) {
fprintf(stderr, "FAILED\n");
fprintf(stderr, "\nFAILED\n\n");
}
else {
fprintf(stderr, "Kernel not suitable!\n");
Expand Down
Loading

0 comments on commit 2cdbf13

Please sign in to comment.