Skip to content

Commit

Permalink
sve - initial SVE backend framework
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremylt committed Aug 30, 2022
1 parent 01005ea commit 9fdef95
Show file tree
Hide file tree
Showing 8 changed files with 485 additions and 2 deletions.
17 changes: 15 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ blocked.c := $(sort $(wildcard backends/blocked/*.c))
ceedmemcheck.c := $(sort $(wildcard backends/memcheck/*.c))
opt.c := $(sort $(wildcard backends/opt/*.c))
avx.c := $(sort $(wildcard backends/avx/*.c))
sve.c := $(sort $(wildcard backends/sve/*.c))
xsmm.c := $(sort $(wildcard backends/xsmm/*.c))
cuda.c := $(sort $(wildcard backends/cuda/*.c))
cuda.cpp := $(sort $(wildcard backends/cuda/*.cpp))
Expand Down Expand Up @@ -286,6 +287,7 @@ info:
$(info ------------------------------------)
$(info MEMCHK_STATUS = $(MEMCHK_STATUS)$(call backend_status,$(MEMCHK_BACKENDS)))
$(info AVX_STATUS = $(AVX_STATUS)$(call backend_status,$(AVX_BACKENDS)))
$(info SVE_STATUS = $(SVE_STATUS)$(call backend_status,$(SVE_BACKENDS)))
$(info XSMM_DIR = $(XSMM_DIR)$(call backend_status,$(XSMM_BACKENDS)))
$(info OCCA_DIR = $(OCCA_DIR)$(call backend_status,$(OCCA_BACKENDS)))
$(info MAGMA_DIR = $(MAGMA_DIR)$(call backend_status,$(MAGMA_BACKENDS)))
Expand Down Expand Up @@ -326,7 +328,7 @@ ifeq ($(MEMCHK),1)
BACKENDS_MAKE += $(MEMCHK_BACKENDS)
endif

# AVX Backed
# AVX Backends
AVX_STATUS = Disabled
AVX_FLAG := $(if $(filter clang,$(CC_VENDOR)),+avx,-mavx)
AVX := $(filter $(AVX_FLAG),$(shell $(CC) $(CFLAGS) -v -E -x c /dev/null 2>&1))
Expand All @@ -337,6 +339,17 @@ ifneq ($(AVX),)
BACKENDS_MAKE += $(AVX_BACKENDS)
endif

# SVE Backends
SVE_STATUS = Disabled
SVE_FLAG := $(if $(filter clang,$(CC_VENDOR)),+sve,-msve)
SVE ?=
SVE_BACKENDS = /cpu/self/sve/serial /cpu/self/sve/blocked
ifneq ($(SVE),)
SVE_STATUS = Enabled
libceed.c += $(sve.c)
BACKENDS_MAKE += $(SVE_BACKENDS)
endif

# Collect list of libraries and paths for use in linking and pkg-config
PKG_LIBS =
# Stubs that will not be RPATH'd
Expand Down Expand Up @@ -416,7 +429,7 @@ ifneq ($(HIP_LIB_DIR),)
BACKENDS_MAKE += $(HIP_BACKENDS)
endif

# MAGMA Backend
# MAGMA Backends
ifneq ($(wildcard $(MAGMA_DIR)/lib/libmagma.*),)
MAGMA_ARCH=$(shell nm -g $(MAGMA_DIR)/lib/libmagma.* | grep -c "hipblas")
ifeq ($(MAGMA_ARCH), 0) #CUDA MAGMA
Expand Down
2 changes: 2 additions & 0 deletions backends/ceed-backend-list.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,7 @@ MACRO(CeedRegister_Opt_Blocked, 1, "/cpu/self/opt/blocked")
MACRO(CeedRegister_Opt_Serial, 1, "/cpu/self/opt/serial")
MACRO(CeedRegister_Ref, 1, "/cpu/self/ref/serial")
MACRO(CeedRegister_Ref_Blocked, 1, "/cpu/self/ref/blocked")
MACRO(CeedRegister_Sve_Serial, 1, "/cpu/self/sve/serial")
MACRO(CeedRegister_Sve_Blocked, 1, "/cpu/self/sve/blocked")
MACRO(CeedRegister_Xsmm_Blocked, 1, "/cpu/self/xsmm/blocked")
MACRO(CeedRegister_Xsmm_Serial, 1, "/cpu/self/xsmm/serial")
2 changes: 2 additions & 0 deletions backends/opt/ceed-opt-blocked.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ static int CeedInit_Opt_Blocked(const char *resource, Ceed ceed) {

ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy",
CeedDestroy_Opt); CeedChkBackend(ierr);
ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate",
CeedTensorContractCreate_Opt); CeedChkBackend(ierr);
ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate",
CeedOperatorCreate_Opt); CeedChkBackend(ierr);

Expand Down
61 changes: 61 additions & 0 deletions backends/sve/ceed-sve-blocked.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
// All Rights reserved. See files LICENSE and NOTICE for details.
//
// This file is part of CEED, a collection of benchmarks, miniapps, software
// libraries and APIs for efficient high-order finite element and spectral
// element discretizations for exascale applications. For more information and
// source code availability see http://github.com/ceed.
//
// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
// a collaborative effort of two U.S. Department of Energy organizations (Office
// of Science and the National Nuclear Security Administration) responsible for
// the planning and preparation of a capable exascale ecosystem, including
// software, applications, hardware, advanced system engineering and early
// testbed platforms, in support of the nation's exascale computing imperative.

#include <ceed/ceed.h>
#include <ceed/backend.h>
#include <stdbool.h>
#include <string.h>
#include "ceed-sve.h"

//------------------------------------------------------------------------------
// Backend Init
//------------------------------------------------------------------------------
static int CeedInit_Sve(const char *resource, Ceed ceed) {
int ierr;
if (strcmp(resource, "/cpu/self") && strcmp(resource, "/cpu/self/sve") &&
strcmp(resource, "/cpu/self/sve/blocked"))
// LCOV_EXCL_START
return CeedError(ceed, CEED_ERROR_BACKEND,
"SVE backend cannot use resource: %s", resource);
// LCOV_EXCL_STOP
ierr = CeedSetDeterministic(ceed, true); CeedChkBackend(ierr);

// Create reference CEED that implementation will be dispatched
// through unless overridden
Ceed ceed_ref;
CeedInit("/cpu/self/opt/blocked", &ceed_ref);
ierr = CeedSetDelegate(ceed, ceed_ref); CeedChkBackend(ierr);

if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate",
CeedTensorContractCreate_f64_Sve);
CeedChkBackend(ierr);
} else {
ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate",
CeedTensorContractCreate_f32_Sve);
CeedChkBackend(ierr);
}

return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Backend Register
//------------------------------------------------------------------------------
CEED_INTERN int CeedRegister_Sve_Blocked(void) {
return CeedRegister("/cpu/self/sve/blocked", CeedInit_Sve, 30);
}
//------------------------------------------------------------------------------
61 changes: 61 additions & 0 deletions backends/sve/ceed-sve-serial.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
// All Rights reserved. See files LICENSE and NOTICE for details.
//
// This file is part of CEED, a collection of benchmarks, miniapps, software
// libraries and APIs for efficient high-order finite element and spectral
// element discretizations for exascale applications. For more information and
// source code availability see http://github.com/ceed.
//
// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
// a collaborative effort of two U.S. Department of Energy organizations (Office
// of Science and the National Nuclear Security Administration) responsible for
// the planning and preparation of a capable exascale ecosystem, including
// software, applications, hardware, advanced system engineering and early
// testbed platforms, in support of the nation's exascale computing imperative.

#include <ceed/ceed.h>
#include <ceed/backend.h>
#include <stdbool.h>
#include <string.h>
#include "ceed-sve.h"

//------------------------------------------------------------------------------
// Backend Init
//------------------------------------------------------------------------------
static int CeedInit_Sve(const char *resource, Ceed ceed) {
int ierr;
if (strcmp(resource, "/cpu/self")
&& strcmp(resource, "/cpu/self/sve/serial"))
// LCOV_EXCL_START
return CeedError(ceed, CEED_ERROR_BACKEND,
"SVE backend cannot use resource: %s", resource);
// LCOV_EXCL_STOP
ierr = CeedSetDeterministic(ceed, true); CeedChkBackend(ierr);

// Create reference CEED that implementation will be dispatched
// through unless overridden
Ceed ceed_ref;
CeedInit("/cpu/self/opt/serial", &ceed_ref);
ierr = CeedSetDelegate(ceed, ceed_ref); CeedChkBackend(ierr);

if (CEED_SCALAR_TYPE == CEED_SCALAR_FP64) {
ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate",
CeedTensorContractCreate_f64_Sve);
CeedChkBackend(ierr);
} else {
ierr = CeedSetBackendFunction(ceed, "Ceed", ceed, "TensorContractCreate",
CeedTensorContractCreate_f32_Sve);
CeedChkBackend(ierr);
}

return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Backend Register
//------------------------------------------------------------------------------
CEED_INTERN int CeedRegister_Sve_Serial(void) {
return CeedRegister("/cpu/self/sve/serial", CeedInit_Sve, 35);
}
//------------------------------------------------------------------------------
158 changes: 158 additions & 0 deletions backends/sve/ceed-sve-tensor-f32.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
// Copyright (c) 2017-2018, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory. LLNL-CODE-734707.
// All Rights reserved. See files LICENSE and NOTICE for details.
//
// This file is part of CEED, a collection of benchmarks, miniapps, software
// libraries and APIs for efficient high-order finite element and spectral
// element discretizations for exascale applications. For more information and
// source code availability see http://github.com/ceed.
//
// The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
// a collaborative effort of two U.S. Department of Energy organizations (Office
// of Science and the National Nuclear Security Administration) responsible for
// the planning and preparation of a capable exascale ecosystem, including
// software, applications, hardware, advanced system engineering and early
// testbed platforms, in support of the nation's exascale computing imperative.

#include <ceed/ceed.h>
#include <ceed/backend.h>
#ifdef __ARM_FEATURE_SVE
#include <arm_sve.h>
#endif
#include <stdbool.h>
#include "ceed-sve.h"

//------------------------------------------------------------------------------
// Blocked Tensor Contract
//------------------------------------------------------------------------------
static inline int CeedTensorContract_Sve_Blocked(CeedTensorContract contract,
CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u,
float *restrict v, const CeedInt JJ) {
CeedInt t_stride_0 = B, t_stride_1 = 1;
if (t_mode == CEED_TRANSPOSE) {
t_stride_0 = 1; t_stride_1 = J;
}

for (CeedInt a=0; a<A; a++)
for (CeedInt b=0; b<B; b++)
// Blocks of JJ rows
for (CeedInt j=0; j<(J/JJ)*JJ; j+=JJ)
for (CeedInt jj=0; jj<JJ; jj++) // unroll
// C vectorization by compiler
for (int32_t c=0; c<C; c+=svcntd()) {
svbool_t pg = svwhilelt_b32(c, C);
// Load u, v into vectors
svfloat32_t u_vec = svld1(pg, &u[(a*B+b)*C+c]);
svfloat32_t v_vec = svld1(pg, &v[(a*J+j+jj)*C+c]);
// Basis matrix value
float tq = t[(j+jj)*t_stride_0 + b*t_stride_1];
// fmadd
svst1(pg, &v[(a*J+j+jj)*C+c], svmla_x(pg, v_vec, u_vec, tq));
}

// Remainder of rows
CeedInt j=(J/JJ)*JJ;
if (j < J)
for (CeedInt a=0; a<A; a++)
for (CeedInt b=0; b<B; b++)
// Blocks of JJ rows
for (CeedInt jj=0; jj<J-j; jj++) // not unrolled
// C vectorization by compiler
for (int32_t c=0; c<C; c+=svcntd()) {
svbool_t pg = svwhilelt_b32(c, C);
// Load u, v into vectors
svfloat32_t u_vec = svld1(pg, &u[(a*B+b)*C+c]);
svfloat32_t v_vec = svld1(pg, &v[(a*J+j+jj)*C+c]);
// Basis matrix value
float tq = t[(j+jj)*t_stride_0 + b*t_stride_1];
// fmadd
svst1(pg, &v[(a*J+j+jj)*C+c], svmla_x(pg, v_vec, u_vec, tq));
}

return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Blocked Tensor Contract
//------------------------------------------------------------------------------
static inline int CeedTensorContract_Sve_Serial(CeedTensorContract contract,
CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u,
float *restrict v, const CeedInt JJ) {
CeedInt t_stride_0 = B, t_stride_1 = 1;
if (t_mode == CEED_TRANSPOSE) {
t_stride_0 = 1; t_stride_1 = J;
}

for (CeedInt a=0; a<A; a++)
for (CeedInt b=0; b<B; b++)
for (CeedInt j=0; j<(J/JJ)*JJ; j+=JJ)
for (CeedInt jj=0; jj<JJ; jj++) // unroll
v[a*J+(j+jj)] += t[(j+jj)*t_stride_0 + b*t_stride_1] * u[a*B+b];

CeedInt j=(J/JJ)*JJ;
if (j < J)
for (CeedInt a=0; a<A; a++)
for (CeedInt b=0; b<B; b++)
for (CeedInt jj=0; jj<J-j; jj++) // not unrolled
v[a*J+(j+jj)] += t[(j+jj)*t_stride_0 + b*t_stride_1] * u[a*B+b];

return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Tensor Contract - Common Sizes
//------------------------------------------------------------------------------
static int CeedTensorContract_Sve_Blocked_8(CeedTensorContract contract,
CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u,
float *restrict v) {
return CeedTensorContract_Sve_Blocked(contract, A, B, C, J, t, t_mode, add, u,
v, 8);
}
static int CeedTensorContract_Sve_Serial_8(CeedTensorContract contract,
CeedInt A, CeedInt B, CeedInt C, CeedInt J, const float *restrict t,
CeedTransposeMode t_mode, const CeedInt add, const float *restrict u,
float *restrict v) {
return CeedTensorContract_Sve_Serial(contract, A, B, C, J, t, t_mode, add, u, v,
8);
}

//------------------------------------------------------------------------------
// Tensor Contract Apply
//------------------------------------------------------------------------------
static int CeedTensorContractApply_Sve(CeedTensorContract contract, CeedInt A,
CeedInt B, CeedInt C, CeedInt J,
const float *restrict t,
CeedTransposeMode t_mode,
const CeedInt add,
const float *restrict u,
float *restrict v) {
if (!add)
for (CeedInt q=0; q<A*J*C; q++)
v[q] = (float) 0.0;

if (C == 1)
CeedTensorContract_Sve_Serial_8(contract, A, B, C, J, t, t_mode, true, u, v);
else
CeedTensorContract_Sve_Blocked_8(contract, A, B, C, J, t, t_mode, true, u, v);

return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
// Tensor Contract Create
//------------------------------------------------------------------------------
int CeedTensorContractCreate_f32_Sve(CeedBasis basis,
CeedTensorContract contract) {
int ierr;
Ceed ceed;
ierr = CeedTensorContractGetCeed(contract, &ceed); CeedChkBackend(ierr);

ierr = CeedSetBackendFunction(ceed, "TensorContract", contract, "Apply",
CeedTensorContractApply_Sve); CeedChkBackend(ierr);

return CEED_ERROR_SUCCESS;
}
//------------------------------------------------------------------------------
Loading

0 comments on commit 9fdef95

Please sign in to comment.