Skip to content

Commit

Permalink
sve - initial SVE backend framework
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremylt committed Mar 8, 2023
1 parent 404fb0e commit 2ddda2a
Show file tree
Hide file tree
Showing 8 changed files with 428 additions and 2 deletions.
17 changes: 15 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ blocked.c := $(sort $(wildcard backends/blocked/*.c))
ceedmemcheck.c := $(sort $(wildcard backends/memcheck/*.c))
opt.c := $(sort $(wildcard backends/opt/*.c))
avx.c := $(sort $(wildcard backends/avx/*.c))
sve.c := $(sort $(wildcard backends/sve/*.c))
xsmm.c := $(sort $(wildcard backends/xsmm/*.c))
cuda.c := $(sort $(wildcard backends/cuda/*.c))
cuda.cpp := $(sort $(wildcard backends/cuda/*.cpp))
Expand Down Expand Up @@ -301,6 +302,7 @@ info:
$(info ------------------------------------)
$(info MEMCHK_STATUS = $(MEMCHK_STATUS)$(call backend_status,$(MEMCHK_BACKENDS)))
$(info AVX_STATUS = $(AVX_STATUS)$(call backend_status,$(AVX_BACKENDS)))
$(info SVE_STATUS = $(SVE_STATUS)$(call backend_status,$(SVE_BACKENDS)))
$(info XSMM_DIR = $(XSMM_DIR)$(call backend_status,$(XSMM_BACKENDS)))
$(info OCCA_DIR = $(OCCA_DIR)$(call backend_status,$(OCCA_BACKENDS)))
$(info MAGMA_DIR = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS)))
Expand Down Expand Up @@ -341,7 +343,7 @@ ifeq ($(MEMCHK),1)
BACKENDS_MAKE += $(MEMCHK_BACKENDS)
endif

# AVX Backed
# AVX Backends
AVX_STATUS = Disabled
AVX_FLAG := $(if $(filter clang,$(CC_VENDOR)),+avx,-mavx)
AVX := $(filter $(AVX_FLAG),$(shell $(CC) $(CFLAGS:-M%=) -v -E -x c /dev/null 2>&1))
Expand All @@ -352,6 +354,17 @@ ifneq ($(AVX),)
BACKENDS_MAKE += $(AVX_BACKENDS)
endif

# SVE Backends
SVE_STATUS = Disabled
SVE_FLAG := $(if $(filter clang,$(CC_VENDOR)),+sve,-msve)
SVE ?=
SVE_BACKENDS = /cpu/self/sve/serial /cpu/self/sve/blocked
ifneq ($(SVE),)
SVE_STATUS = Enabled
libceed.c += $(sve.c)
BACKENDS_MAKE += $(SVE_BACKENDS)
endif

# Collect list of libraries and paths for use in linking and pkg-config
PKG_LIBS =
# Stubs that will not be RPATH'd
Expand Down Expand Up @@ -434,7 +447,7 @@ ifneq ($(HIP_LIB_DIR),)
BACKENDS_MAKE += $(HIP_BACKENDS)
endif

# MAGMA Backend
# MAGMA Backends
ifneq ($(wildcard $(MAGMA_DIR)/lib/libmagma.*),)
MAGMA_ARCH=$(shell nm -g $(MAGMA_DIR)/lib/libmagma.* | grep -c "hipblas")
ifeq ($(MAGMA_ARCH), 0) #CUDA MAGMA
Expand Down
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ There are multiple supported backends, which can be selected at runtime in the e
| `/cpu/self/opt/blocked` | Blocked optimized C implementation | Yes |
| `/cpu/self/avx/serial` | Serial AVX implementation | Yes |
| `/cpu/self/avx/blocked` | Blocked AVX implementation | Yes |
| `/cpu/self/sve/serial` | Serial SVE implementation | Yes |
| `/cpu/self/sve/blocked` | Blocked SVE implementation | Yes |
||
| **CPU Valgrind** |
| `/cpu/self/memcheck/*` | Memcheck backends, undefined value checks | Yes |
Expand Down Expand Up @@ -180,6 +182,8 @@ The `/cpu/self/opt/*` backends are written in pure C and use partial e-vectors t

The `/cpu/self/avx/*` backends rely upon AVX instructions to provide vectorized CPU performance.

The `/cpu/self/sve/*` backends rely upon SVE instructions to provide vectorized CPU performance.

The `/cpu/self/memcheck/*` backends rely upon the [Valgrind](http://valgrind.org/) Memcheck tool to help verify that user QFunctions have no undefined values.
To use, run your code with Valgrind and the Memcheck backends, e.g. `valgrind ./build/ex1 -ceed /cpu/self/ref/memcheck`.
A 'development' or 'debugging' version of Valgrind with headers is required to use this backend.
Expand Down
2 changes: 2 additions & 0 deletions backends/ceed-backend-list.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,7 @@ MACRO(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
MACRO(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
MACRO(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
MACRO(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked")
MACRO(CeedRegister_Sve_Serial, 1, "/cpu/self/sve/serial")
MACRO(CeedRegister_Sve_Blocked, 1, "/cpu/self/sve/blocked")
MACRO(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
MACRO(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")
45 changes: 45 additions & 0 deletions backends/sve/ceed-sve-blocked.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed/backend.h>
#include <ceed/ceed.h>
#include <stdbool.h>
#include <string.h>

#include "ceed-sve.h"

//------------------------------------------------------------------------------
// Backend Init
//------------------------------------------------------------------------------
static int CeedInit_Sve(const char *resource, Ceed ceed) {
if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/sve") && strcmp(resource, "/cpu/self/sve/blocked")) {
// LCOV_EXCL_START
return CeedError(ceed, CEED_ERROR_BACKEND, "SVE backend cannot use resource: %s", resource);
// LCOV_EXCL_STOP
}
CeedCallBackend(CeedSetDeterministic(ceed, true));

// Create reference CEED that implementation will be dispatched
// through unless overridden
Ceed ceed_ref;
CeedInit("/cpu/self/opt/blocked", &ceed_ref);
CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));

if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve));
} else {
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve);
}

return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Backend Register
//------------------------------------------------------------------------------
CEED_INTERN int CeedRegister_Sve_Blocked(void) { return CeedRegister("/cpu/self/sve/blocked", CeedInit_Sve, 30); }
//------------------------------------------------------------------------------
45 changes: 45 additions & 0 deletions backends/sve/ceed-sve-serial.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed/backend.h>
#include <ceed/ceed.h>
#include <stdbool.h>
#include <string.h>

#include "ceed-sve.h"

//------------------------------------------------------------------------------
// Backend Init
//------------------------------------------------------------------------------
static int CeedInit_Sve(const char *resource, Ceed ceed) {
if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/sve/serial")) {
// LCOV_EXCL_START
return CeedError(ceed, CEED_ERROR_BACKEND, "SVE backend cannot use resource: %s", resource);
// LCOV_EXCL_STOP
}
CeedCallBackend(CeedSetDeterministic(ceed, true));

// Create reference CEED that implementation will be dispatched
// through unless overridden
Ceed ceed_ref;
CeedInit("/cpu/self/opt/serial", &ceed_ref);
CeedCallBackend(CeedSetDelegate(ceed, ceed_ref));

if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f64_Sve));
} else {
CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate", CeedTensorContractCreate_f32_Sve));
}

return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Backend Register
//------------------------------------------------------------------------------
CEED_INTERN int CeedRegister_Sve_Serial(void) { return CeedRegister("/cpu/self/sve/serial", CeedInit_Sve, 35); }
//------------------------------------------------------------------------------
150 changes: 150 additions & 0 deletions backends/sve/ceed-sve-tensor-f32.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed/backend.h>
#include <ceed/ceed.h>
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
#include <stdbool.h>

#include "ceed-sve.h"

//------------------------------------------------------------------------------
// Blocked Tensor Contract
//------------------------------------------------------------------------------
static inline int CeedTensorContract_Sve_Blocked(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v,
const CeedInt JJ) {
CeedInt t_stride_0 = B, t_stride_1 = 1;
if (t_mode == CEED_TRANSPOSE) {
t_stride_0 = 1;
t_stride_1 = J;
}

for (CeedInt a = 0; a < A; a++) {
for (CeedInt b = 0; b < B; b++) {
// Blocks of JJ rows
for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
for (CeedInt jj = 0; jj < JJ; jj++) { // unroll
// C vectorization by compiler
for (int32_t c = 0; c < C; c += svcntd()) {
svbool_t pg = svwhilelt_b32(c, C);
// Load u, v into vectors
svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
// Basis matrix value
float tq = t[(j + jj) * t_stride_0 + b * t_stride_1];
// fmadd
svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
}
}
}
}
}

// Remainder of rows
CeedInt j = (J / JJ) * JJ;
if (j < J) {
for (CeedInt a = 0; a < A; a++) {
for (CeedInt b = 0; b < B; b++) {
// Blocks of JJ rows
for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled
// C vectorization by compiler
for (int32_t c = 0; c < C; c += svcntd()) {
svbool_t pg = svwhilelt_b32(c, C);
// Load u, v into vectors
svfloat32_t u_vec = svld1(pg, &u[(a * B + b) * C + c]);
svfloat32_t v_vec = svld1(pg, &v[(a * J + j + jj) * C + c]);
// Basis matrix value
float tq = t[(j + jj) * t_stride_0 + b * t_stride_1];
// fmadd
svst1(pg, &v[(a * J + j + jj) * C + c], svmla_x(pg, v_vec, u_vec, tq));
}
}
}
}
}

return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Blocked Tensor Contract
//------------------------------------------------------------------------------
static inline int CeedTensorContract_Sve_Serial(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v,
const CeedInt JJ) {
CeedInt t_stride_0 = B, t_stride_1 = 1;
if (t_mode == CEED_TRANSPOSE) {
t_stride_0 = 1;
t_stride_1 = J;
}

for (CeedInt a = 0; a < A; a++) {
for (CeedInt b = 0; b < B; b++) {
for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) {
for (CeedInt jj = 0; jj < JJ; jj++) { // unroll
v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
}
}
}
}

CeedInt j = (J / JJ) * JJ;
if (j < J) {
for (CeedInt a = 0; a < A; a++) {
for (CeedInt b = 0; b < B; b++) {
for (CeedInt jj = 0; jj < J - j; jj++) { // not unrolled
v[a * J + (j + jj)] += t[(j + jj) * t_stride_0 + b * t_stride_1] * u[a * B + b];
}
}
}
}

return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Tensor Contract - Common Sizes
//------------------------------------------------------------------------------
static int CeedTensorContract_Sve_Blocked_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
return CeedTensorContract_Sve_Blocked(contract, A, B, C, J, t, t_mode, add, u, v, 8);
}
static int CeedTensorContract_Sve_Serial_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
return CeedTensorContract_Sve_Serial(contract, A, B, C, J, t, t_mode, add, u, v, 8);
}

//------------------------------------------------------------------------------
// Tensor Contract Apply
//------------------------------------------------------------------------------
static int CeedTensorContractApply_Sve(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u, float *restrict v) {
if (!add) {
for (CeedInt q = 0; q < A * J * C; q++) v[q] = (float)0.0;
}

if (C == 1) CeedTensorContract_Sve_Serial_8(contract, A, B, C, J, t, t_mode, true, u, v);
else CeedTensorContract_Sve_Blocked_8(contract, A, B, C, J, t, t_mode, true, u, v);

return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Tensor Contract Create
//------------------------------------------------------------------------------
int CeedTensorContractCreate_f32_Sve(CeedBasis basis, CeedTensorContract contract) {
Ceed ceed;
CeedCallBackend(CeedTensorContractGetCeed(contract, &ceed));

CeedCallBackend(CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply", CeedTensorContractApply_Sve));

return CEED_ERROR_SUCCESS;
}
//------------------------------------------------------------------------------
Loading

0 comments on commit 2ddda2a

Please sign in to comment.